Percy3822 committed on
Commit
976e3b5
·
verified ·
1 Parent(s): ff0f504

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +352 -93
app.py CHANGED
@@ -1,117 +1,376 @@
1
- # app.py
2
- import os, io, time, uuid, shutil, tempfile
3
- from pathlib import Path
4
- from fastapi import FastAPI, Request
5
- from fastapi.responses import FileResponse, StreamingResponse, JSONResponse
6
- from fastapi.websockets import WebSocket
7
- from pydantic import BaseModel
8
  import subprocess
9
- import wave
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # ========== CONFIG ==========
14
- ROOT_DIR = Path("/tmp/tts_app")
15
- VOICES_DIR = ROOT_DIR / "voices"
16
- FILES_DIR = ROOT_DIR / "files"
17
- VOICES_DIR.mkdir(parents=True, exist_ok=True)
18
- FILES_DIR.mkdir(parents=True, exist_ok=True)
19
- DEFAULT_VOICE = "en_US-libritts-high"
20
- DEFAULT_SR = 22050
21
 
22
- # ========== HEALTH ==========
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  @app.get("/health")
24
- async def health():
 
 
 
 
 
 
 
 
 
 
25
  return {
26
  "ok": True,
27
  "engine": "piper-tts (CLI, CPU)",
28
  "default_voice": DEFAULT_VOICE,
29
  "voice_dir": str(VOICES_DIR),
30
- "available_voices": [f.stem for f in VOICES_DIR.glob("*.onnx")],
31
  "files_dir": str(FILES_DIR),
32
  }
33
 
34
- # ========== SPEAK (HTTP) ==========
35
- class SpeakRequest(BaseModel):
36
- text: str
37
- voice: str = DEFAULT_VOICE
38
- rate_wpm: int = 170
39
- length_scale: float = 1.0
40
- noise_scale: float = 0.33
41
- noise_w: float = 0.5
 
 
 
 
 
42
 
43
  @app.post("/speak")
44
- async def speak(req: SpeakRequest):
45
- out_path = FILES_DIR / f"{uuid.uuid4().hex}.wav"
46
- voice_path = VOICES_DIR / f"{req.voice}.onnx"
47
-
48
- if not voice_path.exists():
49
- return JSONResponse({"error": "Voice not found."}, status_code=404)
50
-
51
- cmd = [
52
- "piper",
53
- "--model", str(voice_path),
54
- "--output_file", str(out_path),
55
- "--text", req.text,
56
- "--length_scale", str(req.length_scale),
57
- "--noise_scale", str(req.noise_scale),
58
- "--noise_w", str(req.noise_w),
59
- ]
60
- subprocess.run(cmd, check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- return FileResponse(out_path, media_type="audio/wav")
63
 
64
- # ========== STREAM (WebSocket) ==========
65
  @app.websocket("/ws/tts")
66
- async def tts_stream(websocket: WebSocket):
67
- await websocket.accept()
68
  voice = DEFAULT_VOICE
69
- settings = {
70
- "length_scale": 1.0,
71
- "noise_scale": 0.33,
72
- "noise_w": 0.5
73
- }
74
-
75
- temp_file = FILES_DIR / f"{uuid.uuid4().hex}.wav"
76
- wave_writer = wave.open(str(temp_file), 'wb')
77
- wave_writer.setnchannels(1)
78
- wave_writer.setsampwidth(2)
79
- wave_writer.setframerate(DEFAULT_SR)
80
 
81
  try:
82
  while True:
83
- data = await websocket.receive_text()
84
-
85
- if data.startswith("{") and "text" in data:
86
- import json
87
- payload = json.loads(data)
88
- text = payload.get("text", "")
89
- voice = payload.get("voice", DEFAULT_VOICE)
90
- settings["length_scale"] = float(payload.get("length_scale", 1.0))
91
- settings["noise_scale"] = float(payload.get("noise_scale", 0.33))
92
- settings["noise_w"] = float(payload.get("noise_w", 0.5))
93
-
94
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
95
- cmd = [
96
- "piper",
97
- "--model", str(VOICES_DIR / f"{voice}.onnx"),
98
- "--output_file", tmp.name,
99
- "--text", text,
100
- "--length_scale", str(settings["length_scale"]),
101
- "--noise_scale", str(settings["noise_scale"]),
102
- "--noise_w", str(settings["noise_w"]),
103
- ]
104
- subprocess.run(cmd, check=True)
105
-
106
- with open(tmp.name, "rb") as f:
107
- audio = f.read()
108
- await websocket.send_bytes(audio)
109
-
110
- tmp.close()
111
- os.unlink(tmp.name)
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  except Exception as e:
114
- print(f"[TTS WS Error] {e}")
115
- finally:
116
- wave_writer.close()
117
- await websocket.close()
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import re
5
+ import shlex
 
 
6
  import subprocess
7
+ import tarfile
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Optional, Dict, Any
11
+
12
+ import uvicorn
13
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
14
+ from fastapi.responses import JSONResponse, FileResponse, PlainTextResponse
15
+
16
+ # -------------------------
17
+ # Writable directory picker
18
+ # -------------------------
19
+
20
def pick_writable_dir(*candidates: Path) -> Path:
    """Return the first candidate directory that can be created and written to.

    Each candidate is created if needed (including parents) and then probed by
    writing and removing a small ``.probe`` file inside it. Falsy candidates
    (for example ``None``) are skipped.

    Args:
        *candidates: Directories to try, in priority order.

    Returns:
        The first candidate that passed the write probe.

    Raises:
        RuntimeError: If no candidate is usable. The message lists every
            candidate that was tried together with the error it produced.
    """
    errs = []
    for p in candidates:
        if not p:
            continue
        try:
            p.mkdir(parents=True, exist_ok=True)
            probe = p / ".probe"
            with open(probe, "wb") as f:
                f.write(b"ok")
            probe.unlink(missing_ok=True)
            return p
        except Exception as e:
            # Record why this candidate failed and fall through to the next
            # one. The attribute must be ``__name__``: a misspelling here
            # raises AttributeError inside the handler and aborts the search.
            errs.append(f"{p}: {type(e).__name__}({e})")
    raise RuntimeError("No writable dir. Tried:\n" + "\n".join(errs))
35
+
36
# Optional base-directory override taken from the TTS_DATA_DIR environment
# variable. When set, its "voices" and "files" subdirectories are tried before
# the built-in fallback locations.
ENV_DIR: Optional[str] = os.environ.get("TTS_DATA_DIR")

# Resolved storage locations. Both start as None and are assigned by
# init_dirs() at import time.
VOICES_DIR: Optional[Path] = None
FILES_DIR: Optional[Path] = None
40
+
41
def init_dirs():
    """Resolve the module-level VOICES_DIR and FILES_DIR.

    For each of the two directories a priority list is built: the matching
    subdirectory of ENV_DIR first (only when ENV_DIR is set), followed by a
    fixed list of fallback locations. The first entry accepted by
    pick_writable_dir() wins; that helper raises RuntimeError when none is
    usable.
    """
    global VOICES_DIR, FILES_DIR

    # Zero or one override roots, so the same expression works in both cases.
    override_roots = [Path(ENV_DIR)] if ENV_DIR else []

    VOICES_DIR = pick_writable_dir(
        *(root / "voices" for root in override_roots),
        Path("/home/user/.cache/actualtts/voices"),
        Path("/home/user/voices"),
        Path("/tmp/actualtts/voices"),
        Path("/dev/shm/actualtts_voices"),
    )

    FILES_DIR = pick_writable_dir(
        *(root / "files" for root in override_roots),
        Path("/home/user/.cache/actualtts/files"),
        Path("/tmp/actualtts/files"),
        Path("/dev/shm/actualtts_files"),
    )


# Resolve the directories once, at import time.
init_dirs()
65
+
66
+ # -------------------------
67
+ # Piper CLI integration
68
+ # -------------------------
69
+
70
# Name or path of the Piper executable. Defaults to "piper" (looked up on
# PATH); set the PIPER_BIN environment variable if your image installs it
# somewhere else.
PIPER_BIN = os.environ.get("PIPER_BIN", "piper")

# Catalog of downloadable voices: voice id -> archive URL.
# NOTE(review): confirm that these .onnx.tar.gz archives exist at these URLs.
HF_VOICES: Dict[str, str] = {
    "en_US-libritts-high": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-libritts-high.onnx.tar.gz",
    "en_US-lessac-high": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-lessac-high.onnx.tar.gz",
    "en_US-amy-medium": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-amy-medium.onnx.tar.gz",
}

# Voice used when a request does not name one; overridable via DEFAULT_VOICE.
DEFAULT_VOICE = os.environ.get("DEFAULT_VOICE", "en_US-libritts-high")

# Audio format advertised to WebSocket clients in the "ready" event.
# Piper typically outputs 22050 Hz.
DEFAULT_SR, DEFAULT_CHANNELS = 22050, 1

# Shared HTTP session, created lazily by _http() on first use.
_HTTP_CLIENT = None
86
+
87
+
88
def _http():
    """Return the shared ``requests.Session``, creating it on first use.

    ``requests`` is imported inside the function so the dependency is only
    loaded when a download is actually needed. The session carries a fixed
    User-Agent header and is cached in the module-level _HTTP_CLIENT.
    """
    global _HTTP_CLIENT
    if _HTTP_CLIENT is not None:
        return _HTTP_CLIENT

    import requests

    _HTTP_CLIENT = requests.Session()
    _HTTP_CLIENT.headers.update({"User-Agent": "ActualTTS/CPU"})
    return _HTTP_CLIENT
95
+
96
+
97
def _safe_extract(archive: tarfile.TarFile, dest: Path) -> None:
    """Extract *archive* into *dest*, refusing members that would escape it."""
    if hasattr(tarfile, "data_filter"):
        # Interpreters with extraction filters: the "data" filter rejects
        # absolute paths, ".." traversal, links leaving dest and device files.
        archive.extractall(dest, filter="data")
        return

    # Older interpreters: validate every member by hand before extracting.
    root = dest.resolve()
    for member in archive.getmembers():
        target = (root / member.name).resolve()
        if target != root and root not in target.parents:
            raise RuntimeError(f"Unsafe path in voice archive: {member.name}")
        if member.issym() or member.islnk() or member.isdev():
            raise RuntimeError(f"Unsupported member in voice archive: {member.name}")
    archive.extractall(dest)


def ensure_voice(voice_id: str) -> Dict[str, Path]:
    """Ensure the given voice is present locally, downloading it if missing.

    A voice lives in ``VOICES_DIR/<voice_id>/`` as ``<voice_id>.onnx`` plus
    ``<voice_id>.onnx.json``. If both files exist they are returned as-is.
    Otherwise the archive URL is looked up in HF_VOICES, downloaded, and
    extracted into that directory. The generic names ``en-us``, ``en_us`` and
    ``english`` (case-insensitive) resolve to ``en_US-libritts-high``.

    Args:
        voice_id: Voice identifier or one of the generic aliases. It is used
            as a directory and file name, so it must be a plain name.

    Returns:
        ``{"model": Path, "config": Path}`` for the resolved voice.

    Raises:
        RuntimeError: If the id is not a plain name, is unknown, or the
            expected files are missing after extraction.
        Exception: Download and archive errors propagate unchanged.
    """

    def voice_paths(vid: str):
        vdir = VOICES_DIR / vid
        return vdir, vdir / f"{vid}.onnx", vdir / f"{vid}.onnx.json"

    # voice_id may come straight from a request and becomes part of a
    # filesystem path, so anything that is not a plain name is rejected.
    if (
        not isinstance(voice_id, str)
        or voice_id in ("", ".", "..")
        or "/" in voice_id
        or "\\" in voice_id
    ):
        raise RuntimeError(f"Invalid voice id '{voice_id}'. Known: {list(HF_VOICES)}")

    vdir, model, config = voice_paths(voice_id)
    if model.exists() and config.exists():
        return {"model": model, "config": config}

    url = HF_VOICES.get(voice_id)
    if not url:
        # Heuristic: accept a few generic names as an alias for the default
        # English voice.
        if voice_id.lower() not in ("en-us", "en_us", "english"):
            raise RuntimeError(f"Unknown/unsupported voice '{voice_id}'. Known: {list(HF_VOICES)}")
        voice_id = "en_US-libritts-high"
        url = HF_VOICES[voice_id]
        vdir, model, config = voice_paths(voice_id)
        # Re-check the cache for the resolved voice; without this an alias
        # would trigger a fresh download on every call.
        if model.exists() and config.exists():
            return {"model": model, "config": config}

    vdir.mkdir(parents=True, exist_ok=True)
    tar_path = vdir / f"{voice_id}.onnx.tar.gz"

    try:
        # Download
        r = _http().get(url, timeout=120, stream=True)
        try:
            r.raise_for_status()
            with open(tar_path, "wb") as f:
                for chunk in r.iter_content(1 << 16):
                    if chunk:
                        f.write(chunk)
        finally:
            # Always release the streamed connection back to the session.
            r.close()

        # Extract
        with tarfile.open(tar_path, "r:gz") as tf:
            _safe_extract(tf, vdir)
    finally:
        # Remove the archive whether or not the download/extract succeeded,
        # so a partial file is never left behind.
        tar_path.unlink(missing_ok=True)

    if not model.exists() or not config.exists():
        raise RuntimeError(f"Voice files not found after extraction: {voice_id}")

    return {"model": model, "config": config}
142
+
143
+
144
def build_piper_cmd(text: str, voice_id: str, to_stdout: bool, out_path: Optional[Path] = None,
                    length_scale: float = 1.10, noise_scale: float = 0.35, noise_w: float = 0.90) -> list:
    """Assemble the Piper CLI argument list for one synthesis run.

    The voice is resolved through ensure_voice() first, so any error it raises
    propagates from here. ``text`` is not placed on the command line; the
    callers in this file write it to the process's stdin instead.

    Args:
        text: Text to synthesize (unused here, see above).
        voice_id: Voice identifier passed to ensure_voice().
        to_stdout: When true, the output target is ``-``.
        out_path: Output file; required when ``to_stdout`` is false.
        length_scale: Value passed as ``--length_scale``.
        noise_scale: Value passed as ``--noise_scale``.
        noise_w: Value passed as ``--noise_w``.

    Returns:
        The argument list, starting with PIPER_BIN.

    Raises:
        ValueError: If ``to_stdout`` is false and ``out_path`` is None.
    """
    voice_files = ensure_voice(voice_id)

    command = [PIPER_BIN]
    command += ["-m", str(voice_files["model"])]
    command += ["-c", str(voice_files["config"])]
    command.append("-q")  # quieter logs
    for flag, value in (
        ("--length_scale", length_scale),
        ("--noise_scale", noise_scale),
        ("--noise_w", noise_w),
    ):
        command += [flag, str(value)]

    if to_stdout:
        destination = "-"
    elif out_path is None:
        raise ValueError("out_path is required when to_stdout=False")
    else:
        destination = str(out_path)

    command += ["-f", destination]
    return command
163
+
164
 
165
async def run_piper_to_file(text: str, voice_id: str, out_path: Path,
                            length_scale: float, noise_scale: float, noise_w: float) -> None:
    """Run Piper once, feeding ``text`` on stdin and writing a WAV to ``out_path``.

    Args:
        text: Text to synthesize; sent to the process UTF-8 encoded.
        voice_id: Voice identifier, resolved by build_piper_cmd().
        out_path: Destination file handed to Piper.
        length_scale: Passed through to build_piper_cmd().
        noise_scale: Passed through to build_piper_cmd().
        noise_w: Passed through to build_piper_cmd().

    Raises:
        RuntimeError: If Piper exits with a non-zero status; the message
            includes the exit code and the captured stderr.
    """
    cmd = build_piper_cmd(text, voice_id, to_stdout=False, out_path=out_path,
                          length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
    proc = await asyncio.create_subprocess_exec(
        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    # communicate() writes the input, closes stdin and drains stdout/stderr
    # while waiting for exit. StreamWriter.write() is not awaitable, and a bare
    # wait() with piped output can deadlock once a pipe buffer fills up.
    _, stderr_bytes = await proc.communicate(text.encode("utf-8"))
    if proc.returncode != 0:
        stderr = stderr_bytes.decode("utf-8", "ignore")
        raise RuntimeError(f"Piper failed (code {proc.returncode}). Stderr:\n{stderr}")
180
 
 
 
 
 
 
 
 
 
181
 
182
async def run_piper_stream(text: str, voice_id: str, websocket: WebSocket,
                           length_scale: float, noise_scale: float, noise_w: float) -> None:
    """Stream Piper's audio output to a WebSocket while it is being rendered.

    Protocol, as sent by this function:
      1. a text frame ``{"event": "ready", "sr": ..., "channels": ...}``,
      2. binary frames with the audio data read from Piper's stdout, with the
         leading WAV header removed once so the client receives PCM samples,
      3. a text frame ``{"event": "done"}`` on success, or
         ``{"event": "error", "detail": <stderr>}`` if Piper exits non-zero.

    If the client disconnects mid-stream the function returns quietly. In
    every exit path a still-running Piper process is killed.

    Args:
        text: Text to synthesize; written to Piper's stdin UTF-8 encoded.
        voice_id: Voice identifier, resolved by build_piper_cmd().
        websocket: Accepted WebSocket connection to stream to.
        length_scale: Passed through to build_piper_cmd().
        noise_scale: Passed through to build_piper_cmd().
        noise_w: Passed through to build_piper_cmd().
    """
    cmd = build_piper_cmd(text, voice_id, to_stdout=True,
                          length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)

    # Tell client we're ready (so it can open its audio device early)
    await websocket.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CHANNELS}))

    proc = await asyncio.create_subprocess_exec(
        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    # Drain stderr concurrently so Piper can never block on a full stderr pipe
    # while we are busy forwarding stdout.
    stderr_task = asyncio.ensure_future(proc.stderr.read())

    # Assumes a simple 44-byte RIFF/WAVE header at the start of the stream
    # — TODO confirm against the Piper build in use.
    wav_header_len = 44
    head = b""              # bytes buffered until the header check can run
    header_checked = False  # the header is inspected exactly once per stream

    try:
        try:
            # StreamWriter.write() is synchronous; only drain() is awaited.
            proc.stdin.write(text.encode("utf-8"))
            await proc.stdin.drain()
            proc.stdin.close()
        except (BrokenPipeError, ConnectionResetError):
            # Piper exited before reading its input; the exit status and
            # stderr are reported below.
            pass

        while True:
            chunk = await proc.stdout.read(4096)
            if not chunk:
                break

            if not header_checked:
                # Buffer until a full header's worth of bytes has arrived, so
                # a header split across two reads is still recognised.
                head += chunk
                if len(head) < wav_header_len:
                    continue
                header_checked = True
                if head[0:4] == b"RIFF" and head[8:12] == b"WAVE":
                    chunk = head[wav_header_len:]
                else:
                    chunk = head
                head = b""
                if not chunk:
                    continue

            await websocket.send_bytes(chunk)

        if head:
            # Output ended before the header check could run; forward the
            # short payload unchanged.
            await websocket.send_bytes(head)

        await proc.wait()
        if proc.returncode != 0:
            stderr = (await stderr_task).decode("utf-8", "ignore")
            await websocket.send_text(json.dumps({"event": "error", "detail": stderr}))
        else:
            await websocket.send_text(json.dumps({"event": "done"}))
    except WebSocketDisconnect:
        # Client went away; cleanup happens in the finally block.
        pass
    finally:
        # Never leave a synthesis process running, whatever ended the stream.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass
            await proc.wait()
        if not stderr_task.done():
            stderr_task.cancel()
231
+
232
+
233
+ # ---------------
234
+ # FastAPI wiring
235
+ # ---------------
236
+
237
app = FastAPI(title="ActualTTS (CPU)")


@app.get("/health")
def health():
    """Report service status, configured directories and installed voices.

    A voice counts as installed when VOICES_DIR contains a subdirectory
    ``<name>`` holding both ``<name>.onnx`` and ``<name>.onnx.json``.
    """

    def list_voices():
        return [
            entry.name
            for entry in VOICES_DIR.iterdir()
            if entry.is_dir()
            and (entry / f"{entry.name}.onnx").exists()
            and (entry / f"{entry.name}.onnx.json").exists()
        ]

    return {
        "ok": True,
        "engine": "piper-tts (CLI, CPU)",
        "default_voice": DEFAULT_VOICE,
        "voice_dir": str(VOICES_DIR),
        "available_voices": list_voices(),
        "files_dir": str(FILES_DIR),
    }
259
 
260
+
261
@app.get("/")
def root():
    """Serve a one-line plain-text banner pointing at the real endpoints."""
    banner = "ActualTTS (CPU) — use POST /speak or WS /ws/tts"
    return PlainTextResponse(banner)
264
+
265
+
266
@app.get("/file/{name}")
def get_file(name: str):
    """Serve a previously generated file from FILES_DIR.

    Args:
        name: Plain file name inside FILES_DIR (no directory components).

    Returns:
        A FileResponse for the file, or a 404 JSON response
        ``{"ok": false, "error": "not found"}`` when the name is not a plain
        file name or does not refer to a regular file.
    """
    # The name comes from the URL: accept plain file names only, so a request
    # can never reach outside FILES_DIR.
    is_plain_name = (
        name not in ("", ".", "..")
        and "/" not in name
        and "\\" not in name
    )
    path = FILES_DIR / name
    # is_file() rather than exists(): a directory (e.g. "..") would pass
    # exists() and then fail inside FileResponse.
    if not is_plain_name or not path.is_file():
        return JSONResponse({"ok": False, "error": "not found"}, status_code=404)
    return FileResponse(path)
272
+
273
 
274
@app.post("/speak")
async def speak(request: Request):
    """
    Synthesize speech to a WAV file and return its download URL.

    JSON body:
    {
      "text": "Hello world",
      "voice": "en_US-libritts-high",
      "length_scale": 1.10,
      "noise_scale": 0.35,
      "noise_w": 0.90
    }
    Only "text" is required; the other fields fall back to the defaults shown
    (voice falls back to DEFAULT_VOICE).

    Returns:
      200 {"ok": true, "audio_url": "/file/tts-<ts>-<rand>.wav"}
      400 {"detail": ...} for a malformed body or invalid field values
      500 {"ok": false, "error": ...} when voice setup or synthesis fails
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"detail": "Invalid JSON"}, status_code=400)
    # Valid JSON that is not an object (e.g. a list) has no fields to read.
    if not isinstance(body, dict):
        return JSONResponse({"detail": "JSON body must be an object"}, status_code=400)

    raw_text = body.get("text")
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    if not text:
        return JSONResponse({"detail": "Missing text"}, status_code=400)

    raw_voice = body.get("voice") or DEFAULT_VOICE
    if not isinstance(raw_voice, str):
        return JSONResponse({"detail": "Invalid voice"}, status_code=400)
    voice = raw_voice.strip()

    # Bad tuning values are a client error, not a server crash.
    try:
        length_scale = float(body.get("length_scale", 1.10))
        noise_scale = float(body.get("noise_scale", 0.35))
        noise_w = float(body.get("noise_w", 0.90))
    except (TypeError, ValueError):
        return JSONResponse(
            {"detail": "length_scale, noise_scale and noise_w must be numbers"},
            status_code=400,
        )

    # prepare output; the random suffix keeps two requests that arrive in the
    # same millisecond from writing to the same file
    ts = int(time.time() * 1000)
    out_path = FILES_DIR / f"tts-{ts}-{os.urandom(4).hex()}.wav"

    try:
        # ensure voice (download if needed); this is blocking I/O, so run it
        # in a worker thread to keep the event loop responsive
        await asyncio.get_running_loop().run_in_executor(None, ensure_voice, voice)
        # run piper CLI to a wav file
        await run_piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_w)
    except Exception as e:
        # Do not leave a partial output file behind.
        out_path.unlink(missing_ok=True)
        return JSONResponse({"ok": False, "error": str(e)}, status_code=500)

    return {"ok": True, "audio_url": f"/file/{out_path.name}"}
315
 
 
316
 
 
317
@app.websocket("/ws/tts")
async def ws_tts(ws: WebSocket):
    """WebSocket endpoint for streamed synthesis.

    The client sends JSON text frames carrying an "event" field:

    * ``{"event": "init", "voice": ..., "length_scale": ..., "noise_scale": ...,
      "noise_w": ...}`` — all fields optional. Selects the voice and tuning
      for later requests, makes sure the voice is available, and answers with
      a "ready" event. If the voice cannot be prepared, an "error" event is
      sent and the connection is closed.
    * ``{"event": "speak", "text": ...}`` — synthesizes the text and streams
      the audio through run_piper_stream().

    Frames that are not JSON objects and unknown events are ignored. Invalid
    field values produce an "error" event and leave the connection open.
    """
    await ws.accept()
    voice = DEFAULT_VOICE
    length_scale = 1.10
    noise_scale = 0.35
    noise_w = 0.90

    async def send_error(detail: str) -> None:
        await ws.send_text(json.dumps({"event": "error", "detail": detail}))

    try:
        while True:
            msg = await ws.receive_text()
            try:
                data = json.loads(msg)
            except ValueError:
                continue
            # Valid JSON that is not an object is ignored, like malformed
            # JSON, rather than tearing the connection down.
            if not isinstance(data, dict):
                continue

            ev = data.get("event")
            if ev == "init":
                requested_voice = data.get("voice") or voice
                if not isinstance(requested_voice, str):
                    await send_error("voice must be a string")
                    continue
                # allow optional tuning via WS; parse everything before
                # committing so a bad value leaves the settings untouched
                try:
                    new_length_scale = float(data["length_scale"]) if "length_scale" in data else length_scale
                    new_noise_scale = float(data["noise_scale"]) if "noise_scale" in data else noise_scale
                    new_noise_w = float(data["noise_w"]) if "noise_w" in data else noise_w
                except (TypeError, ValueError):
                    await send_error("length_scale, noise_scale and noise_w must be numbers")
                    continue
                voice = requested_voice.strip()
                length_scale, noise_scale, noise_w = new_length_scale, new_noise_scale, new_noise_w

                # ensure voice now so we can send ready immediately; the
                # download is blocking I/O, so it runs in a worker thread
                try:
                    await asyncio.get_running_loop().run_in_executor(None, ensure_voice, voice)
                except Exception as e:
                    await send_error(str(e))
                    await ws.close()
                    return
                await ws.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CHANNELS}))

            elif ev == "speak":
                raw_text = data.get("text")
                text_to_speak = raw_text.strip() if isinstance(raw_text, str) else ""
                if not text_to_speak:
                    await send_error("empty text")
                    continue
                # stream via stdout
                await run_piper_stream(text_to_speak, voice, ws, length_scale, noise_scale, noise_w)
            else:
                # ignore unknown events
                pass

    except WebSocketDisconnect:
        return
    except Exception as e:
        # Best effort: report the failure and close. The socket may already
        # be unusable, so errors from these two calls are ignored on purpose.
        try:
            await send_error(str(e))
        except Exception:
            pass
        try:
            await ws.close()
        except Exception:
            pass
372
+
373
+
374
if __name__ == "__main__":
    # Local debugging entry point only; on Spaces the server is started by the
    # image's Entrypoint/Cmd instead.
    bind_host, bind_port = "0.0.0.0", 7860
    uvicorn.run("app:app", host=bind_host, port=bind_port, reload=False)