OrbitMC committed on
Commit
b0a95c7
·
verified ·
1 Parent(s): 9060018

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +371 -61
Dockerfile CHANGED
@@ -1,65 +1,375 @@
1
- # ==============================================================================
2
- # Dockerfile — Headless Qwen-3.5 Chat + Piper TTS · Hugging Face Docker Space
3
- # ==============================================================================
4
- FROM python:3.10-slim
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  WORKDIR /app
7
 
8
- # ── 1. System dependencies ───────────────────────────────────────────────────
9
- # build-essential + cmake → compile llama-cpp-python from source
10
- # libgomp1 → OpenMP runtime (used by llama.cpp & ONNX Runtime)
11
- # ca-certificates + wget → HTTPS downloads
12
- RUN apt-get update && \
13
- apt-get install -y --no-install-recommends \
14
- wget ca-certificates \
15
- build-essential cmake g++ \
16
- libgomp1 && \
17
- rm -rf /var/lib/apt/lists/*
18
-
19
- # ── 2. Piper TTS pre-built binary (Linux x86-64) ─────────────────────────────
20
- RUN wget -q \
21
- "https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_linux_x86_64.tar.gz" \
22
- -O /tmp/piper.tar.gz && \
23
- tar -xzf /tmp/piper.tar.gz -C /app && \
24
- rm /tmp/piper.tar.gz && \
25
- chmod +x /app/piper/piper
26
-
27
- # ── 3. TTS voice – fast, realistic female English voice ───────────────────────
28
- # Piper auto-loads voice.onnx.json when it sits next to voice.onnx
29
- RUN mkdir -p /app/tts && \
30
- wget -q \
31
- "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx" \
32
- -O /app/tts/voice.onnx && \
33
- wget -q \
34
- "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json" \
35
- -O /app/tts/voice.onnx.json
36
-
37
- # ── 4. LLM GGUF (Qwen 3.5 0.8B, Q3_K_XL quant from Unsloth) ────────────────
38
- RUN mkdir -p /app/models && \
39
- wget --progress=dot:mega \
40
- "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-UD-Q3_K_XL.gguf" \
41
- -O /app/models/qwen.gguf
42
-
43
- # ── 5. Python packages ───────────────────────────────────────────────────────
44
  RUN pip install --no-cache-dir \
45
- flask \
46
- llama-cpp-python
47
-
48
- # ── 6. Application code + runtime dirs ────────────────────────────────────────
49
- COPY app.py /app/app.py
50
- RUN mkdir -p /tmp/audio
51
-
52
- # ── 7. Environment — so app.py finds every binary & asset by env-var ──────────
53
- ENV PIPER_BIN="/app/piper/piper" \
54
- TTS_VOICE="/app/tts/voice.onnx" \
55
- LLM_PATH="/app/models/qwen.gguf" \
56
- AUDIO_DIR="/tmp/audio" \
57
- LD_LIBRARY_PATH="/app/piper"
58
-
59
- # ── 8. Non-root user (HF Spaces requirement) ─────────────────────────────────
60
- RUN useradd -m -u 1000 user && \
61
- chown -R user:user /app /tmp/audio
62
- USER user
63
-
64
- EXPOSE 7860
65
- CMD ["python", "app.py"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # Dockerfile — Fast Anime-English TTS Server (Piper-based)
3
+ # ============================================================
4
+ # Build: docker build -t anime-tts .
5
+ # Run: docker run -p 5000:5000 anime-tts
6
+ # Usage: curl -X POST http://localhost:5000/tts \
7
+ # -H "Content-Type: application/json" \
8
+ # -d '{"text":"Hello senpai! Welcome to the anime world!"}' \
9
+ # --output speech.wav
10
+ # ============================================================
11
+
12
+ FROM python:3.11-slim
13
+
14
+ # Install system deps
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ wget \
17
+ curl \
18
+ libsndfile1 \
19
+ ffmpeg \
20
+ && rm -rf /var/lib/apt/lists/*
21
 
22
  WORKDIR /app
23
 
24
+ # Install Python dependencies
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  RUN pip install --no-cache-dir \
26
+ flask \
27
+ piper-tts \
28
+ numpy \
29
+ scipy
30
+
31
+ # --------------------------------------------------------------------------
32
+ # Download a fast, high-quality anime-style English voice
33
+ # We use "lessac" (medium quality, very expressive/bright) as the base
34
+ # and also download an anime-adjacent voice.
35
+ #
36
+ # Available voices: https://huggingface.co/rhasspy/piper-voices/tree/main
37
+ #
38
+ # Voice options (pick ONE pair — model + config):
39
+ # 1) en_US-lessac-medium — bright, expressive female (anime-adjacent)
40
+ # 2) en_US-libritts_r-medium — multiple speakers, some sound anime-like
41
+ # 3) en_GB-jenny_dioco-medium — young British female
42
+ #
43
+ # We'll download THREE voices (lessac, jenny, amy) so users can pick via the API.
44
+ # --------------------------------------------------------------------------
45
+
46
+ RUN mkdir -p /app/voices
47
+
48
+ # Voice 1: Lessac (bright, expressive, anime-adjacent female)
49
+ RUN wget -q "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx" \
50
+ -O /app/voices/lessac.onnx && \
51
+ wget -q "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json" \
52
+ -O /app/voices/lessac.onnx.json
53
+
54
+ # Voice 2: Jenny Dioco (young, bright British female — anime dub style)
55
+ RUN wget -q "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_GB/jenny_dioco/medium/en_GB-jenny_dioco-medium.onnx" \
56
+ -O /app/voices/jenny.onnx && \
57
+ wget -q "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_GB/jenny_dioco/medium/en_GB-jenny_dioco-medium.onnx.json" \
58
+ -O /app/voices/jenny.onnx.json
59
+
60
+ # Voice 3: Amy (medium, clear North-American — works well sped up)
61
+ RUN wget -q "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/medium/en_US-amy-medium.onnx" \
62
+ -O /app/voices/amy.onnx && \
63
+ wget -q "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/medium/en_US-amy-medium.onnx.json" \
64
+ -O /app/voices/amy.onnx.json
65
+
66
+ # --------------------------------------------------------------------------
67
+ # Create the TTS API server
68
+ # --------------------------------------------------------------------------
69
+ RUN cat > /app/server.py << 'PYTHON_SERVER'
70
+ #!/usr/bin/env python3
71
+ """
72
+ Fast Anime-Voice TTS Server using Piper.
73
+
74
+ Endpoints:
75
+ POST /tts — Generate speech, return WAV (or MP3)
76
+ POST /tts/batch — Join multiple texts and return a single WAV
77
+ GET /voices — List available voices
78
+ GET /health — Health check
79
+
80
+ JSON body for /tts:
81
+ {
82
+ "text": "Hello world!",
83
+ "voice": "lessac", // optional: lessac, jenny, amy (default: lessac)
84
+ "speed": 1.0, // optional: clamped to 0.25-4.0 (default: 1.0)
85
+ "pitch_shift": 0, // optional: semitones to shift pitch (for anime effect, try 2-4)
86
+ "output_format": "wav" // optional: wav, mp3 (default: wav)
87
+ }
88
+ """
89
+
90
+ import io
91
+ import os
92
+ import time
93
+ import wave
94
+ import struct
95
+ import subprocess
96
+ import tempfile
97
+ import logging
98
+ from pathlib import Path
99
+ from typing import Optional
100
+
101
+ import numpy as np
102
+ from flask import Flask, request, jsonify, send_file, Response
103
+ from piper import PiperVoice
104
+
105
+ logging.basicConfig(level=logging.INFO)
106
+ logger = logging.getLogger("anime-tts")
107
+
108
+ app = Flask(__name__)
109
+
110
+ # ---- Voice Registry ----
111
+ VOICES_DIR = Path("/app/voices")
112
+ VOICE_MAP = {
113
+ "lessac": VOICES_DIR / "lessac.onnx",
114
+ "jenny": VOICES_DIR / "jenny.onnx",
115
+ "amy": VOICES_DIR / "amy.onnx",
116
+ }
117
+
118
+ # Cache loaded voices for speed
119
+ _voice_cache: dict[str, PiperVoice] = {}
120
+
121
+
122
+ def get_voice(name: str) -> PiperVoice:
123
+ """Load and cache a Piper voice."""
124
+ if name not in _voice_cache:
125
+ model_path = VOICE_MAP.get(name)
126
+ if not model_path or not model_path.exists():
127
+ raise ValueError(f"Voice '{name}' not found. Available: {list(VOICE_MAP.keys())}")
128
+ logger.info(f"Loading voice: {name} from {model_path}")
129
+ _voice_cache[name] = PiperVoice.load(str(model_path))
130
+ logger.info(f"Voice '{name}' loaded successfully")
131
+ return _voice_cache[name]
132
+
133
+
134
+ def synthesize_speech(
135
+ text: str,
136
+ voice_name: str = "lessac",
137
+ speed: float = 1.0,
138
+ pitch_shift: int = 0,
139
+ output_format: str = "wav",
140
+ ) -> io.BytesIO:
141
+ """Synthesize text to speech and return audio bytes."""
142
+
143
+ voice = get_voice(voice_name)
144
+
145
+ # Synthesize to WAV in memory
146
+ wav_buffer = io.BytesIO()
147
+
148
+ # Piper uses length_scale for speed (inverse: lower = faster)
149
+ length_scale = 1.0 / max(0.25, min(speed, 4.0))
150
+
151
+ with wave.open(wav_buffer, "wb") as wav_file:
152
+ voice.synthesize(
153
+ text,
154
+ wav_file,
155
+ length_scale=length_scale,
156
+ sentence_silence=0.15,
157
+ )
158
+
159
+ wav_buffer.seek(0)
160
+
161
+ # Apply pitch shift if requested (for anime effect)
162
+ if pitch_shift != 0 or output_format == "mp3":
163
+ wav_buffer = post_process_audio(wav_buffer, pitch_shift, output_format)
164
+
165
+ return wav_buffer
166
+
167
+
168
+ def post_process_audio(
169
+ wav_buffer: io.BytesIO,
170
+ pitch_shift: int = 0,
171
+ output_format: str = "wav",
172
+ ) -> io.BytesIO:
173
+ """Apply pitch shifting and format conversion using ffmpeg."""
174
+
175
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
176
+ tmp_in.write(wav_buffer.read())
177
+ tmp_in_path = tmp_in.name
178
+
179
+ suffix = f".{output_format}"
180
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_out:
181
+ tmp_out_path = tmp_out.name
182
+
183
+ try:
184
+ # Build ffmpeg command
185
+ cmd = ["ffmpeg", "-y", "-i", tmp_in_path]
186
+
187
+ filters = []
188
+
189
+ # Pitch shift using asetrate + aresample (fast method)
190
+ if pitch_shift != 0:
191
+ # Calculate rate multiplier from semitones
192
+ rate_mult = 2 ** (pitch_shift / 12.0)
193
+ # Read original sample rate
194
+ with wave.open(tmp_in_path, "rb") as wf:
195
+ orig_sr = wf.getframerate()
196
+ new_sr = int(orig_sr * rate_mult)
197
+ filters.append(f"asetrate={new_sr}")
198
+ filters.append(f"aresample={orig_sr}")
199
+ # Compensate tempo change from pitch shift
200
+ tempo = 1.0 / rate_mult
201
+ if 0.5 <= tempo <= 2.0:
202
+ filters.append(f"atempo={tempo}")
203
+ elif tempo < 0.5:
204
+ # Chain atempo filters for extreme values
205
+ filters.append(f"atempo=0.5,atempo={tempo/0.5}")
206
+
207
+ if filters:
208
+ cmd.extend(["-af", ",".join(filters)])
209
+
210
+ if output_format == "mp3":
211
+ cmd.extend(["-codec:a", "libmp3lame", "-q:a", "2"])
212
+
213
+ cmd.append(tmp_out_path)
214
+
215
+ result = subprocess.run(
216
+ cmd, capture_output=True, timeout=30
217
+ )
218
+
219
+ if result.returncode != 0:
220
+ logger.error(f"ffmpeg error: {result.stderr.decode()}")
221
+ # Fall back to original
222
+ wav_buffer.seek(0)
223
+ return wav_buffer
224
+
225
+ output_buffer = io.BytesIO()
226
+ with open(tmp_out_path, "rb") as f:
227
+ output_buffer.write(f.read())
228
+ output_buffer.seek(0)
229
+ return output_buffer
230
+
231
+ finally:
232
+ os.unlink(tmp_in_path)
233
+ if os.path.exists(tmp_out_path):
234
+ os.unlink(tmp_out_path)
235
+
236
+
237
+ # ---- Pre-warm default voice on startup ----
238
+ @app.before_request
239
+ def _warmup():
240
+ """Lazy warmup — load default voice on first request."""
241
+ app.before_request_funcs[None].remove(_warmup)
242
+ try:
243
+ get_voice("lessac")
244
+ except Exception as e:
245
+ logger.warning(f"Warmup failed: {e}")
246
+
247
+
248
+ # ---- API Routes ----
249
+
250
+ @app.route("/health", methods=["GET"])
251
+ def health():
252
+ return jsonify({"status": "ok", "engine": "piper-tts", "cached_voices": list(_voice_cache.keys())})
253
+
254
+
255
+ @app.route("/voices", methods=["GET"])
256
+ def list_voices():
257
+ voices = []
258
+ for name, path in VOICE_MAP.items():
259
+ voices.append({
260
+ "name": name,
261
+ "available": path.exists(),
262
+ "description": {
263
+ "lessac": "Bright expressive US female β€” anime-adjacent, great default",
264
+ "jenny": "Young bright British female β€” anime dub style",
265
+ "amy": "Clear US female β€” works well with pitch shift for anime effect",
266
+ }.get(name, ""),
267
+ "tip": "Try pitch_shift=2 or pitch_shift=3 for more anime-like sound",
268
+ })
269
+ return jsonify({"voices": voices})
270
+
271
+
272
+ @app.route("/tts", methods=["POST"])
273
+ def tts():
274
+ """Main TTS endpoint."""
275
+ start = time.time()
276
+
277
+ data = request.get_json(force=True, silent=True) or {}
278
+
279
+ text = data.get("text", "").strip()
280
+ if not text:
281
+ return jsonify({"error": "No text provided"}), 400
282
+
283
+ if len(text) > 10000:
284
+ return jsonify({"error": "Text too long (max 10000 chars)"}), 400
285
+
286
+ voice_name = data.get("voice", "lessac")
287
+ speed = float(data.get("speed", 1.0))
288
+ pitch_shift = int(data.get("pitch_shift", 0))
289
+ output_format = data.get("output_format", "wav").lower()
290
+
291
+ if output_format not in ("wav", "mp3"):
292
+ return jsonify({"error": "output_format must be 'wav' or 'mp3'"}), 400
293
+
294
+ if voice_name not in VOICE_MAP:
295
+ return jsonify({
296
+ "error": f"Unknown voice '{voice_name}'",
297
+ "available": list(VOICE_MAP.keys())
298
+ }), 400
299
+
300
+ try:
301
+ audio_buffer = synthesize_speech(
302
+ text=text,
303
+ voice_name=voice_name,
304
+ speed=speed,
305
+ pitch_shift=pitch_shift,
306
+ output_format=output_format,
307
+ )
308
+ except Exception as e:
309
+ logger.exception("Synthesis failed")
310
+ return jsonify({"error": str(e)}), 500
311
+
312
+ elapsed = time.time() - start
313
+ logger.info(f"TTS: {len(text)} chars, voice={voice_name}, speed={speed}, "
314
+ f"pitch={pitch_shift}, format={output_format}, time={elapsed:.3f}s")
315
+
316
+ mimetype = "audio/wav" if output_format == "wav" else "audio/mpeg"
317
+
318
+ return send_file(
319
+ audio_buffer,
320
+ mimetype=mimetype,
321
+ as_attachment=True,
322
+ download_name=f"speech.{output_format}",
323
+ )
324
+
325
+
326
+ @app.route("/tts/batch", methods=["POST"])
327
+ def tts_batch():
328
+ """Batch TTS — synthesize multiple texts."""
329
+ data = request.get_json(force=True, silent=True) or {}
330
+ texts = data.get("texts", [])
331
+
332
+ if not texts or not isinstance(texts, list):
333
+ return jsonify({"error": "Provide 'texts' as a list of strings"}), 400
334
+
335
+ voice_name = data.get("voice", "lessac")
336
+ speed = float(data.get("speed", 1.0))
337
+ pitch_shift = int(data.get("pitch_shift", 0))
338
+
339
+ # Concatenate all texts with pauses
340
+ combined = ". ".join(texts)
341
+
342
+ try:
343
+ audio_buffer = synthesize_speech(
344
+ text=combined,
345
+ voice_name=voice_name,
346
+ speed=speed,
347
+ pitch_shift=pitch_shift,
348
+ )
349
+ except Exception as e:
350
+ return jsonify({"error": str(e)}), 500
351
+
352
+ return send_file(audio_buffer, mimetype="audio/wav", as_attachment=True, download_name="batch.wav")
353
+
354
+
355
+ if __name__ == "__main__":
356
+ # Pre-load default voice
357
+ logger.info("Pre-loading default voice...")
358
+ try:
359
+ get_voice("lessac")
360
+ logger.info("Default voice ready!")
361
+ except Exception as e:
362
+ logger.error(f"Failed to pre-load voice: {e}")
363
+
364
+ app.run(host="0.0.0.0", port=5000, threaded=True)
365
+ PYTHON_SERVER
366
+
367
+ # Expose port
368
+ EXPOSE 5000
369
+
370
+ # Health check
371
+ HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
372
+ CMD curl -f http://localhost:5000/health || exit 1
373
+
374
+ # Run the server
375
+ CMD ["python", "/app/server.py"]