Percy3822 committed on
Commit
9d33e4a
·
verified ·
1 Parent(s): 90ba8a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +226 -131
app.py CHANGED
@@ -1,152 +1,247 @@
1
- import os, io, time, json, asyncio, tempfile, wave, uuid, shlex
2
- from typing import Optional
3
-
4
- from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Body
5
- from fastapi.responses import JSONResponse, FileResponse, PlainTextResponse
6
- from fastapi.middleware.cors import CORSMiddleware
7
- import anyio
8
-
9
- APP_NAME = "ActualTTS (espeak-ng)"
10
- BASE_DIR = "/tmp/actual_tts"
11
- FILES_DIR = os.path.join(BASE_DIR, "files")
12
- os.makedirs(FILES_DIR, exist_ok=True)
13
-
14
- DEFAULT_VOICE = "en-us" # espeak voice id
15
- DEFAULT_RATE_WPM = 170 # speaking speed
16
-
17
- app = FastAPI(title=APP_NAME)
18
-
19
- # CORS: allow local client
20
- app.add_middleware(
21
- CORSMiddleware,
22
- allow_origins=["*"],
23
- allow_methods=["*"],
24
- allow_headers=["*"],
 
 
 
 
 
 
 
 
25
  )
26
 
27
- @app.get("/", response_class=PlainTextResponse)
28
- def root():
29
- return "OK"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  @app.get("/health")
32
  def health():
33
  return {
34
  "ok": True,
35
- "engine": "espeak-ng",
36
  "default_voice": DEFAULT_VOICE,
37
- "files_dir": FILES_DIR,
38
- "tip": "WebSocket /ws/tts (init, then speak), or POST /speak",
 
 
39
  }
40
 
41
- def espeak_cmd(text: str, voice: str = DEFAULT_VOICE, rate_wpm: int = DEFAULT_RATE_WPM):
42
- # --stdout makes espeak-ng write a valid WAV to stdout
43
- # -v voice, -s speed(WPM)
44
- # We quote text via shell=False + pass list args
45
- return ["espeak-ng", "--stdout", "-v", voice, "-s", str(rate_wpm), text]
46
-
47
- async def synth_to_file(text: str, voice: str, rate_wpm: int) -> str:
48
- """Run espeak-ng once, capture its WAV, write to a file, return path."""
49
- tmp_path = os.path.join(FILES_DIR, f"tts-{int(time.time()*1000)}.wav")
50
- proc = await asyncio.create_subprocess_exec(
51
- *espeak_cmd(text, voice, rate_wpm),
52
- stdout=asyncio.subprocess.PIPE,
53
- stderr=asyncio.subprocess.PIPE,
54
- )
55
- wav_bytes, err = await proc.communicate()
56
- if proc.returncode != 0 or not wav_bytes:
57
- raise RuntimeError(f"espeak-ng failed rc={proc.returncode}, err={err.decode('utf-8','ignore')}")
58
- with open(tmp_path, "wb") as f:
59
- f.write(wav_bytes)
60
- return tmp_path
61
-
62
  @app.post("/speak")
63
- async def speak_post(payload: dict = Body(...)):
64
- text: str = payload.get("text", "")
65
- voice: str = payload.get("voice") or DEFAULT_VOICE
66
- rate_wpm: int = int(payload.get("rate_wpm", DEFAULT_RATE_WPM))
67
- if not text.strip():
68
- return JSONResponse({"ok": False, "error": "no text"}, status_code=400)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  try:
70
- path = await synth_to_file(text, voice, rate_wpm)
71
  except Exception as e:
72
- return JSONResponse({"ok": False, "error": "Synthesis failed", "detail": str(e)}, status_code=500)
73
- rel = f"/file/{os.path.basename(path)}"
74
- return {"ok": True, "audio_url": rel}
75
-
76
- @app.get("/file/{fname}")
77
- async def get_file(fname: str):
78
- path = os.path.join(FILES_DIR, fname)
79
- if not os.path.isfile(path):
80
- return JSONResponse({"ok": False, "error": "not found"}, status_code=404)
81
- return FileResponse(path, media_type="audio/wav")
82
-
83
- # --------- WebSocket Streaming TTS ----------
84
- # Protocol:
85
- # Client sends: {"event":"init","voice":"en-us","rate_wpm":170}
86
- # Server replies: {"event":"ready","sr":22050}
87
- # Client sends: {"event":"speak","text":"..."}
88
- # Server streams: binary frames with WAV bytes as they are produced
89
- # then {"event":"done"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  @app.websocket("/ws/tts")
91
  async def ws_tts(ws: WebSocket):
 
 
 
 
 
 
 
 
92
  await ws.accept()
93
- voice = DEFAULT_VOICE
94
- rate_wpm = DEFAULT_RATE_WPM
 
 
 
95
  try:
96
- # Expect init first
97
- first = await ws.receive_text()
98
- msg = json.loads(first)
99
- if msg.get("event") != "init":
100
- await ws.send_text(json.dumps({"event":"error","detail":"first message must be {'event':'init',...}"}))
101
- await ws.close(code=1002)
102
- return
103
- if "voice" in msg and msg["voice"]:
104
- voice = msg["voice"]
105
- if "rate_wpm" in msg:
106
- try:
107
- rate_wpm = int(msg["rate_wpm"])
108
- except:
109
- pass
110
-
111
- # Tell client our sample-rate. espeak-ng emits 22050 Hz PCM.
112
- await ws.send_text(json.dumps({"event":"ready","sr":22050}))
113
-
114
- # Wait for speak
115
- nxt = await ws.receive_text()
116
- data = json.loads(nxt)
117
- if data.get("event") != "speak" or not data.get("text"):
118
- await ws.send_text(json.dumps({"event":"error","detail":"need {'event':'speak','text':...}"}))
119
- await ws.close()
120
- return
121
-
122
- text = data["text"]
123
-
124
- # Spawn espeak-ng and stream its stdout as binary chunks
125
- proc = await asyncio.create_subprocess_exec(
126
- *espeak_cmd(text, voice, rate_wpm),
127
- stdout=asyncio.subprocess.PIPE,
128
- stderr=asyncio.subprocess.PIPE,
129
- )
 
 
 
 
 
 
 
 
130
 
131
- # espeak-ng writes a full WAV header then data. We just forward in chunks.
132
- try:
133
- while True:
134
- chunk = await proc.stdout.read(4096)
135
- if not chunk:
136
- break
137
- await ws.send_bytes(chunk)
138
- rc = await proc.wait()
139
- if rc != 0:
140
- err = (await proc.stderr.read()).decode("utf-8","ignore")
141
- await ws.send_text(json.dumps({"event":"error","detail":f"espeak-ng failed rc={rc}: {err[:200]}" }))
142
- await ws.send_text(json.dumps({"event":"done"}))
143
- finally:
144
- with anyio.move_on_after(0.1):
145
- proc.kill()
146
  except WebSocketDisconnect:
147
  pass
148
  except Exception as e:
149
- with anyio.move_on_after(0.1):
150
- await ws.send_text(json.dumps({"event":"error","detail":str(e)}))
151
- with anyio.move_on_after(0.1):
152
- await ws.close()
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import wave
5
+ from pathlib import Path
6
+ from typing import Optional, Dict, Any
7
+
8
+ import uvicorn
9
+ import requests
10
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
11
+ from fastapi.responses import JSONResponse
12
+ from fastapi.staticfiles import StaticFiles
13
+
14
+ # Piper (CPU TTS)
15
+ from piper.voice import PiperVoice
16
+
17
+ # ------------ Config ------------
18
+ BASE_DIR = Path(os.getenv("BASE_DIR", "/tmp/brain_app")).resolve()
19
+ FILES_DIR = (BASE_DIR / "files").resolve()
20
+ FILES_DIR.mkdir(parents=True, exist_ok=True)
21
+
22
+ VOICE_DIR = Path(os.getenv("VOICE_DIR", "/home/user/voices")).resolve()
23
+ VOICE_DIR.mkdir(parents=True, exist_ok=True)
24
+
25
+ DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-lessac-high")
26
+ DEFAULT_SR = 22050
27
+ DEFAULT_CHANNELS = 1
28
+
29
+ # Hugging Face Piper voice (Lessac high quality)
30
+ PIPER_HF_BASE = (
31
+ "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/"
32
+ "en/en_US/lessac/high"
33
  )
34
 
35
+ app = FastAPI(title="ActualTTS (CPU Piper, streaming)")
36
+ app.mount("/file", StaticFiles(directory=str(FILES_DIR)), name="file")
37
+
38
+ # ------------ Voice Loader ------------
39
+
40
+ _loaded_voices: Dict[str, PiperVoice] = {}
41
+
42
def list_voices() -> list[str]:
    """Return the stems of all installed voice models in VOICE_DIR, sorted."""
    return sorted(p.stem for p in VOICE_DIR.glob("*.onnx"))
44
+
45
def ensure_voice(voice: str) -> tuple[Path, Path]:
    """
    Ensure the .onnx model and .onnx.json config for *voice* exist locally.

    Only the bundled en_US-lessac-high voice is downloaded on demand from
    the rhasspy/piper-voices Hugging Face repo; any other missing voice
    raises FileNotFoundError.

    Returns:
        (onnx_path, config_path) inside VOICE_DIR.
    Raises:
        FileNotFoundError: voice files absent and not auto-downloadable.
        requests.HTTPError: an on-demand download failed.
    """
    onnx = VOICE_DIR / f"{voice}.onnx"
    cfg = VOICE_DIR / f"{voice}.onnx.json"

    if onnx.exists() and cfg.exists():
        return onnx, cfg

    if voice == "en_US-lessac-high":
        files = {
            onnx: f"{PIPER_HF_BASE}/en_US-lessac-high.onnx",
            cfg: f"{PIPER_HF_BASE}/en_US-lessac-high.onnx.json",
        }
        for path, url in files.items():
            if path.exists():
                # Keep whichever file a previous attempt already completed
                # instead of re-downloading the (large) model.
                continue
            r = requests.get(url, timeout=180)
            r.raise_for_status()
            # Write to a temp name, then atomically rename: a crash mid-write
            # must not leave a truncated file that a later exists() check
            # would wrongly trust.
            tmp = path.with_suffix(path.suffix + ".part")
            tmp.write_bytes(r.content)
            tmp.replace(path)
        return onnx, cfg

    raise FileNotFoundError(f"Voice '{voice}' not found in {VOICE_DIR}")
68
+
69
def get_voice(voice: str) -> PiperVoice:
    """Return a cached PiperVoice for *voice*, loading it on first use."""
    cached = _loaded_voices.get(voice)
    if cached is not None:
        return cached
    onnx, cfg = ensure_voice(voice)
    loaded = PiperVoice.load(str(onnx), config_path=str(cfg))
    _loaded_voices[voice] = loaded
    return loaded
76
+
77
+ # ------------ Helpers ------------
78
+
79
def prosody(body: Dict[str, Any]) -> dict:
    """
    Pull prosody parameters out of a request payload, coercing each to
    float and falling back to defaults tuned for natural-sounding output.

    Note the key rename: clients send "sentence_pause", but the value is
    returned under "sentence_silence" (Piper's keyword argument name).
    """
    # (output key, input key, default)
    spec = (
        ("length_scale", "length_scale", 1.12),       # slightly slower tempo
        ("noise_scale", "noise_scale", 0.33),         # less synthesis buzz
        ("noise_w", "noise_w", 0.8),                  # stabilize phonemes
        ("sentence_silence", "sentence_pause", 0.18),
    )
    return {out: float(body.get(src, default)) for out, src, default in spec}
89
+
90
def write_wav_int16(path: Path, sr: int, pcm: bytes, channels: int = 1):
    """Wrap raw little-endian 16-bit PCM in a WAV container at *path*."""
    wf = wave.open(str(path), "wb")
    try:
        wf.setnchannels(channels)
        wf.setsampwidth(2)  # int16 -> 2 bytes per sample
        wf.setframerate(sr)
        wf.writeframes(pcm)
    finally:
        wf.close()
96
+
97
+ # ------------ Routes ------------
98
 
99
@app.get("/health")
def health():
    """Liveness/status endpoint: engine name, voice inventory, and usage hint."""
    status = {
        "ok": True,
        "engine": "piper-tts (CPU)",
        "default_voice": DEFAULT_VOICE,
        "voice_dir": str(VOICE_DIR),
        "available_voices": list_voices(),
        "files_dir": str(FILES_DIR),
        "tip": "Use WS /ws/tts for streaming or POST /speak for one-shot",
    }
    return status
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
@app.post("/speak")
async def speak(request: Request):
    """
    One-shot synthesis: render the whole utterance to a WAV file under
    FILES_DIR and return its URL on the /file static mount.

    JSON body:
    {
      "text": "Hello world",
      "voice": "en_US-lessac-high",
      "rate_wpm": 165,
      "length_scale": 1.12, "noise_scale": 0.33, "noise_w": 0.8, "sentence_pause": 0.18
    }
    Returns: { ok, audio_url, sr, channels }
    Errors: 400 for bad JSON, missing text, or an unloadable voice.
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse(
            status_code=400,
            content={"ok": False, "error": "Invalid JSON body"}
        )

    text: str = str(body.get("text", "")).strip()
    if not text:
        return JSONResponse(status_code=400, content={"ok": False, "error": "Missing text"})

    voice = body.get("voice", DEFAULT_VOICE)
    try:
        voice_obj = get_voice(voice)
    except Exception as e:
        # Surface the underlying cause (missing file, failed auto-download,
        # bad model) instead of discarding it: the previous handler bound
        # `e` but never used it, masking real errors as "not found".
        return JSONResponse(status_code=400, content={
            "ok": False,
            "error": f"Voice '{voice}' not found. Available: {list_voices()}",
            "detail": str(e),
        })

    # Piper ignores WPM internally; we simulate tempo via length_scale default.
    p = prosody(body)
    # NOTE(review): PiperVoice may expose the rate as config.sample_rate
    # rather than a `sample_rate_hz` attribute -- confirm; the getattr
    # default silently falls back to 22050 if the name is wrong.
    sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
    ch = DEFAULT_CHANNELS

    # Synthesize the full utterance into an in-memory PCM buffer.
    pcm_chunks: list[bytes] = []
    for chunk in voice_obj.synthesize_stream(
        text,
        length_scale=p["length_scale"],
        noise_scale=p["noise_scale"],
        noise_w=p["noise_w"],
        sentence_silence=p["sentence_silence"],
    ):
        # NOTE(review): if the installed piper version yields AudioChunk
        # objects instead of raw bytes, this filter drops everything and
        # produces silent output -- verify synthesize_stream's yield type.
        if isinstance(chunk, (bytes, bytearray)):
            pcm_chunks.append(bytes(chunk))

    # Append ~220 ms of silence so players do not clip the final phoneme.
    tail_frames = int(sr * 0.22)
    pcm_chunks.append(b"\x00" * (tail_frames * ch * 2))

    pcm = b"".join(pcm_chunks)
    fname = f"tts-{int(time.time()*1000)}.wav"
    fpath = FILES_DIR / fname
    write_wav_int16(fpath, sr, pcm, channels=ch)

    return {
        "ok": True,
        "audio_url": f"/file/{fname}",
        "sr": sr,
        "channels": ch,
    }
177
+
178
@app.websocket("/ws/tts")
async def ws_tts(ws: WebSocket):
    """
    Streaming TTS over a WebSocket. The connection stays open and serves
    repeated speak requests until the client disconnects.

    Protocol:
      <- {"event":"init","voice":"en_US-lessac-high"}
      <- {"event":"speak","text":"Hello there...","length_scale":1.12,...}
      -> {"event":"ready","sr":22050,"channels":1}
      -> <binary PCM16> ... many frames ...
      -> {"event":"done"}
    """
    await ws.accept()
    # Per-connection state; "init" is optional -- a bare "speak" lazily
    # loads the default voice and emits "ready" first.
    voice_name = DEFAULT_VOICE
    voice_obj: Optional[PiperVoice] = None
    sr = DEFAULT_SR
    channels = DEFAULT_CHANNELS

    try:
        while True:
            raw = await ws.receive_text()
            msg = json.loads(raw)

            if msg.get("event") == "init":
                voice_name = msg.get("voice", DEFAULT_VOICE)
                voice_obj = get_voice(voice_name)
                # NOTE(review): PiperVoice may expose the rate as
                # config.sample_rate rather than `sample_rate_hz` -- confirm;
                # the getattr default silently falls back to 22050.
                sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
                await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": channels}))

            elif msg.get("event") == "speak":
                # Lazy init path: client skipped "init", so load the default
                # voice and announce "ready" before streaming audio.
                if not voice_obj:
                    voice_obj = get_voice(voice_name)
                    sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
                    await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": channels}))

                text = str(msg.get("text", "")).strip()
                if not text:
                    await ws.send_text(json.dumps({"event": "error", "detail": "Missing text"}))
                    continue

                # Prosody overrides ride along on the speak message itself.
                p = prosody(msg)

                # Stream PCM16 chunks as binary frames
                # NOTE(review): this loop runs synchronously inside the async
                # handler, so long utterances block the event loop; and if
                # synthesize_stream yields AudioChunk objects rather than raw
                # bytes, the isinstance filter drops all audio -- verify the
                # installed piper version's yield type.
                for chunk in voice_obj.synthesize_stream(
                    text,
                    length_scale=p["length_scale"],
                    noise_scale=p["noise_scale"],
                    noise_w=p["noise_w"],
                    sentence_silence=p["sentence_silence"],
                ):
                    if isinstance(chunk, (bytes, bytearray)):
                        await ws.send_bytes(bytes(chunk))

                # tail-silence 220 ms
                tail_frames = int(sr * 0.22)
                await ws.send_bytes(b"\x00" * (tail_frames * channels * 2))

                await ws.send_text(json.dumps({"event": "done"}))

            else:
                await ws.send_text(json.dumps({"event": "error", "detail": "Unknown event"}))

    except WebSocketDisconnect:
        # Normal client hang-up: nothing to clean up.
        pass
    except Exception as e:
        # Best effort: report the failure to the client if the socket is
        # still writable; a second failure here is deliberately swallowed.
        try:
            await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
        except Exception:
            pass
245
+
246
# Script entry point: serve the app with uvicorn.
# BUG FIX: the guard previously compared against "__main-_" (typo), so the
# server never started when this file was executed directly.
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)