Spaces:

OrbitMC
/

vai

Sleeping

App Files Files Community

OrbitMC committed 27 days ago

Commit

b0a95c7

verified ·

1 Parent(s): 9060018

Update Dockerfile

Browse files

Files changed (1) hide show

Dockerfile +371 -61

Dockerfile CHANGED Viewed

@@ -1,65 +1,375 @@
-# ==============================================================================
-# Dockerfile — Headless Qwen-3.5 Chat + Piper TTS · Hugging Face Docker Space
-# ==============================================================================
-FROM python:3.10-slim
 WORKDIR /app
-# ── 1. System dependencies ───────────────────────────────────────────────────
-#    build-essential + cmake → compile llama-cpp-python from source
-#    libgomp1               → OpenMP runtime (used by llama.cpp & ONNX Runtime)
-#    ca-certificates + wget → HTTPS downloads
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        wget ca-certificates \
-        build-essential cmake g++ \
-        libgomp1 && \
-    rm -rf /var/lib/apt/lists/*
-# ── 2. Piper TTS pre-built binary (Linux x86-64) ─────────────────────────────
-RUN wget -q \
-      "https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_linux_x86_64.tar.gz" \
-      -O /tmp/piper.tar.gz && \
-    tar -xzf /tmp/piper.tar.gz -C /app && \
-    rm /tmp/piper.tar.gz && \
-    chmod +x /app/piper/piper
-# ── 3. TTS voice – fast, realistic female English voice ───────────────────────
-#    Piper auto-loads voice.onnx.json when it sits next to voice.onnx
-RUN mkdir -p /app/tts && \
-    wget -q \
-      "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx" \
-      -O /app/tts/voice.onnx && \
-    wget -q \
-      "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json" \
-      -O /app/tts/voice.onnx.json
-# ── 4. LLM GGUF (Qwen 3.5 0.8B, Q3_K_XL quant from Unsloth) ────────────────
-RUN mkdir -p /app/models && \
-    wget --progress=dot:mega \
-      "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-UD-Q3_K_XL.gguf" \
-      -O /app/models/qwen.gguf
-# ── 5. Python packages ───────────────────────────────────────────────────────
 RUN pip install --no-cache-dir \
-        flask \
-        llama-cpp-python
-# ── 6. Application code + runtime dirs ────────────────────────────────────────
-COPY app.py /app/app.py
-RUN mkdir -p /tmp/audio
-# ── 7. Environment — so app.py finds every binary & asset by env-var ──────────
-ENV PIPER_BIN="/app/piper/piper" \
-    TTS_VOICE="/app/tts/voice.onnx" \
-    LLM_PATH="/app/models/qwen.gguf" \
-    AUDIO_DIR="/tmp/audio" \
-    LD_LIBRARY_PATH="/app/piper"
-# ── 8. Non-root user (HF Spaces requirement) ─────────────────────────────────
-RUN useradd -m -u 1000 user && \
-    chown -R user:user /app /tmp/audio
-USER user
-EXPOSE 7860
-CMD ["python", "app.py"]

+# ============================================================
+# Dockerfile — Fast Anime-English TTS Server (Piper-based)
+# ============================================================
+# Build:  docker build -t anime-tts .
+# Run:    docker run -p 5000:5000 anime-tts
+# Usage:  curl -X POST http://localhost:5000/tts \
+#           -H "Content-Type: application/json" \
+#           -d '{"text":"Hello senpai! Welcome to the anime world!"}' \
+#           --output speech.wav
+# ============================================================
+FROM python:3.11-slim
+# System packages (sorted alphabetically):
+#   curl        — used by the HEALTHCHECK probe
+#   ffmpeg      — invoked by server.py for pitch shifting and MP3 encoding
+#   libsndfile1 — NOTE(review): no direct use is visible in this file; confirm it is still required
+#   wget        — fetches the voice models during the build
+# The apt lists are removed in the same layer so they never reach the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        curl \
+        ffmpeg \
+        libsndfile1 \
+        wget \
+    && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
+# Install Python dependencies.
+# piper-tts is pinned to 1.2.0 because server.py calls
+#   PiperVoice.synthesize(text, wav_file, length_scale=..., sentence_silence=...)
+# which is the 1.2.x call signature. NOTE(review): later piper-tts releases are
+# believed to have changed this API — re-test synthesize_speech() before bumping.
 RUN pip install --no-cache-dir \
+    flask \
+    numpy \
+    piper-tts==1.2.0 \
+    scipy
+# --------------------------------------------------------------------------
+# Download a fast, high-quality anime-style English voice
+# We use "lessac" (medium quality, very expressive/bright) as the base
+# and also download an anime-adjacent voice.
+#
+# Available voices: https://huggingface.co/rhasspy/piper-voices/tree/main
+#
+# Voice options (pick ONE pair — model + config):
+#   1) en_US-lessac-medium    — bright, expressive female (anime-adjacent)
+#   2) en_US-libritts_r-medium — multiple speakers, some sound anime-like
+#   3) en_GB-jenny_dioco-medium — young British female
+#
+# We'll download THREE voices (lessac, jenny_dioco, amy) so users can pick via the API.
+# --------------------------------------------------------------------------
+# Every voice is stored as /app/voices/<name>.onnx with its <name>.onnx.json
+# config next to it; server.py looks the models up in this directory.
+RUN mkdir -p /app/voices
+# Voice 1: Lessac (bright, expressive, anime-adjacent female)
+RUN wget -q -O /app/voices/lessac.onnx \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx" \
+    && wget -q -O /app/voices/lessac.onnx.json \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json"
+# Voice 2: Jenny Dioco (young, bright British female — anime dub style)
+RUN wget -q -O /app/voices/jenny.onnx \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_GB/jenny_dioco/medium/en_GB-jenny_dioco-medium.onnx" \
+    && wget -q -O /app/voices/jenny.onnx.json \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_GB/jenny_dioco/medium/en_GB-jenny_dioco-medium.onnx.json"
+# Voice 3: Amy (medium, clear North-American — works well sped up)
+RUN wget -q -O /app/voices/amy.onnx \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/medium/en_US-amy-medium.onnx" \
+    && wget -q -O /app/voices/amy.onnx.json \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/medium/en_US-amy-medium.onnx.json"
+# --------------------------------------------------------------------------
+# Create the TTS API server
+# --------------------------------------------------------------------------
+RUN cat > /app/server.py << 'PYTHON_SERVER'
+#!/usr/bin/env python3
+"""
+Fast Anime-Voice TTS Server using Piper.
+Endpoints:
+  POST /tts          — Generate speech, return WAV
+  POST /tts/batch    — Join several texts and return one WAV
+  GET  /voices       — List available voices
+  GET  /health       — Health check
+JSON body for /tts:
+{
+  "text": "Hello world!",
+  "voice": "lessac",          // optional: lessac, jenny, amy (default: lessac)
+  "speed": 1.0,               // optional: clamped to 0.25-4.0 (default: 1.0)
+  "pitch_shift": 0,           // optional: semitones to shift pitch (for anime effect, try 2-4)
+  "output_format": "wav"      // optional: wav, mp3 (default: wav)
+}
+"""
+import io
+import os
+import time
+import wave
+import struct
+import subprocess
+import tempfile
+import logging
+from pathlib import Path
+from typing import Optional
+import numpy as np
+from flask import Flask, request, jsonify, send_file, Response
+from piper import PiperVoice
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("anime-tts")
+app = Flask(__name__)
+# ---- Voice Registry ----
+# Directory the Dockerfile downloads the .onnx models (and .onnx.json configs) into.
+VOICES_DIR = Path("/app/voices")
+# Public voice name -> path of its ONNX model file.
+VOICE_MAP = {
+    voice_id: VOICES_DIR / model_file
+    for voice_id, model_file in (
+        ("lessac", "lessac.onnx"),
+        ("jenny", "jenny.onnx"),
+        ("amy", "amy.onnx"),
+    )
+}
+# Loaded PiperVoice objects keyed by voice name; filled on demand by get_voice().
+_voice_cache: dict[str, PiperVoice] = {}
+def get_voice(name: str) -> PiperVoice:
+    """Return the PiperVoice registered under ``name``, loading it on first use.
+
+    Loaded voices are kept in ``_voice_cache`` so later calls reuse them.
+
+    Raises:
+        ValueError: if ``name`` is not in ``VOICE_MAP`` or its model file is
+            missing on disk.
+    """
+    # Fast path: already loaded.
+    if name in _voice_cache:
+        return _voice_cache[name]
+    onnx_file = VOICE_MAP.get(name)
+    if not (onnx_file and onnx_file.exists()):
+        raise ValueError(f"Voice '{name}' not found. Available: {list(VOICE_MAP.keys())}")
+    logger.info(f"Loading voice: {name} from {onnx_file}")
+    _voice_cache[name] = PiperVoice.load(str(onnx_file))
+    logger.info(f"Voice '{name}' loaded successfully")
+    return _voice_cache[name]
+def synthesize_speech(
+    text: str,
+    voice_name: str = "lessac",
+    speed: float = 1.0,
+    pitch_shift: int = 0,
+    output_format: str = "wav",
+) -> io.BytesIO:
+    """Render ``text`` with the named Piper voice and return the audio bytes.
+
+    Args:
+        text: Text to speak.
+        voice_name: Key into ``VOICE_MAP``; resolved through ``get_voice``.
+        speed: Speaking-rate multiplier, clamped to the range 0.25-4.0.
+        pitch_shift: Semitones to shift; 0 leaves the audio untouched.
+        output_format: ``"wav"`` or ``"mp3"``.
+
+    Returns:
+        A BytesIO positioned at offset 0. It holds the WAV written by Piper,
+        or whatever ``post_process_audio`` returns when a pitch shift or MP3
+        output is requested.
+    """
+    piper_voice = get_voice(voice_name)
+    # Piper expresses speed as length_scale, the inverse of the rate:
+    # a smaller length_scale means faster speech.
+    clamped_speed = max(0.25, min(speed, 4.0))
+    length_scale = 1.0 / clamped_speed
+    rendered = io.BytesIO()
+    with wave.open(rendered, "wb") as wav_out:
+        piper_voice.synthesize(
+            text,
+            wav_out,
+            length_scale=length_scale,
+            sentence_silence=0.15,
+        )
+    rendered.seek(0)
+    # Plain WAV with no pitch change needs no ffmpeg pass.
+    if pitch_shift == 0 and output_format != "mp3":
+        return rendered
+    return post_process_audio(rendered, pitch_shift, output_format)
+def _atempo_chain(tempo: float) -> list:
+    """Return ffmpeg ``atempo`` filter stages whose combined factor is ``tempo``.
+
+    Each stage is kept inside 0.5-2.0 by peeling off 2.0x / 0.5x stages first,
+    so large pitch shifts in either direction are still tempo-compensated.
+    """
+    stages = []
+    while tempo > 2.0:
+        stages.append("atempo=2.0")
+        tempo /= 2.0
+    while 0 < tempo < 0.5:
+        stages.append("atempo=0.5")
+        tempo /= 0.5
+    stages.append(f"atempo={tempo}")
+    return stages
+def post_process_audio(
+    wav_buffer: io.BytesIO,
+    pitch_shift: int = 0,
+    output_format: str = "wav",
+) -> io.BytesIO:
+    """Apply pitch shifting and/or format conversion by running ffmpeg.
+
+    Args:
+        wav_buffer: WAV data, read from its current position.
+        pitch_shift: Semitones to shift (0 = none). Implemented as
+            asetrate + aresample, followed by atempo stages that restore
+            the original duration.
+        output_format: ``"wav"`` or ``"mp3"``; also used as the output
+            file suffix.
+
+    Returns:
+        A BytesIO at offset 0 with the processed audio. If ffmpeg fails and
+        WAV output was requested, the unmodified input buffer is returned
+        (best-effort fallback).
+
+    Raises:
+        RuntimeError: if ffmpeg fails and a non-WAV format was requested.
+            Returning the WAV input here would make the caller serve WAV
+            bytes under the wrong MIME type.
+        subprocess.TimeoutExpired: if ffmpeg runs longer than 30 seconds.
+    """
+    tmp_in_path = None
+    tmp_out_path = None
+    try:
+        # delete=False: ffmpeg must be able to open the files by name after
+        # we close them; both are removed in the finally block.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
+            tmp_in_path = tmp_in.name
+            tmp_in.write(wav_buffer.read())
+        with tempfile.NamedTemporaryFile(suffix=f".{output_format}", delete=False) as tmp_out:
+            tmp_out_path = tmp_out.name
+        cmd = ["ffmpeg", "-y", "-i", tmp_in_path]
+        filters = []
+        if pitch_shift != 0:
+            # One semitone is a factor of 2^(1/12) in frequency.
+            rate_mult = 2 ** (pitch_shift / 12.0)
+            with wave.open(tmp_in_path, "rb") as wf:
+                orig_sr = wf.getframerate()
+            new_sr = int(orig_sr * rate_mult)
+            # Reinterpret the samples at a new rate (shifts pitch and speed),
+            # then resample back to the original rate.
+            filters.append(f"asetrate={new_sr}")
+            filters.append(f"aresample={orig_sr}")
+            # Undo the speed change so only the pitch differs.
+            filters.extend(_atempo_chain(1.0 / rate_mult))
+        if filters:
+            cmd.extend(["-af", ",".join(filters)])
+        if output_format == "mp3":
+            cmd.extend(["-codec:a", "libmp3lame", "-q:a", "2"])
+        cmd.append(tmp_out_path)
+        result = subprocess.run(
+            cmd, capture_output=True, timeout=30
+        )
+        if result.returncode != 0:
+            logger.error(f"ffmpeg error: {result.stderr.decode(errors='replace')}")
+            if output_format != "wav":
+                raise RuntimeError(f"ffmpeg failed to produce {output_format} output")
+            # Fall back to the original, unshifted WAV
+            wav_buffer.seek(0)
+            return wav_buffer
+        with open(tmp_out_path, "rb") as f:
+            return io.BytesIO(f.read())
+    finally:
+        for leftover in (tmp_in_path, tmp_out_path):
+            if leftover and os.path.exists(leftover):
+                os.unlink(leftover)
+# ---- Pre-warm default voice on first request ----
+@app.before_request
+def _warmup():
+    """One-shot hook: unregister itself, then load the default voice.
+
+    The server runs with threaded=True, so several "first" requests can enter
+    this hook at once. Only one of them can remove the hook from the list;
+    the others must not fail their request because it is already gone.
+    """
+    try:
+        app.before_request_funcs[None].remove(_warmup)
+    except (KeyError, ValueError):
+        # Another request already unregistered the hook.
+        pass
+    try:
+        get_voice("lessac")
+    except Exception as e:
+        # Warmup is best-effort; a real failure surfaces on the /tts call itself.
+        logger.warning(f"Warmup failed: {e}")
+# ---- API Routes ----
+@app.route("/health", methods=["GET"])
+def health():
+    """Report liveness plus the names of the voices currently loaded in memory."""
+    status = {
+        "status": "ok",
+        "engine": "piper-tts",
+        "cached_voices": list(_voice_cache.keys()),
+    }
+    return jsonify(status)
+@app.route("/voices", methods=["GET"])
+def list_voices():
+    """List every registered voice and whether its model file exists on disk."""
+    blurbs = {
+        "lessac": "Bright expressive US female — anime-adjacent, great default",
+        "jenny": "Young bright British female — anime dub style",
+        "amy": "Clear US female — works well with pitch shift for anime effect",
+    }
+    catalog = [
+        {
+            "name": voice_id,
+            "available": model_file.exists(),
+            "description": blurbs.get(voice_id, ""),
+            "tip": "Try pitch_shift=2 or pitch_shift=3 for more anime-like sound",
+        }
+        for voice_id, model_file in VOICE_MAP.items()
+    ]
+    return jsonify({"voices": catalog})
+@app.route("/tts", methods=["POST"])
+def tts():
+    """Main TTS endpoint: synthesize the posted text and return it as a file.
+
+    Reads a JSON object with ``text`` (required) and optional ``voice``,
+    ``speed``, ``pitch_shift`` and ``output_format``. Any malformed field is
+    answered with HTTP 400; a failure during synthesis is answered with 500.
+    """
+    start = time.time()
+    payload = request.get_json(force=True, silent=True)
+    # A JSON array/string/number is valid JSON but not a usable request body.
+    data = payload if isinstance(payload, dict) else {}
+    text = data.get("text", "")
+    if not isinstance(text, str) or not text.strip():
+        return jsonify({"error": "No text provided"}), 400
+    text = text.strip()
+    if len(text) > 10000:
+        return jsonify({"error": "Text too long (max 10000 chars)"}), 400
+    voice_name = data.get("voice", "lessac")
+    try:
+        speed = float(data.get("speed", 1.0))
+        pitch_shift = int(data.get("pitch_shift", 0))
+    except (TypeError, ValueError, OverflowError):
+        return jsonify({"error": "'speed' must be a number and 'pitch_shift' an integer"}), 400
+    # str() so a non-string value is rejected below instead of crashing on .lower()
+    output_format = str(data.get("output_format", "wav")).lower()
+    if output_format not in ("wav", "mp3"):
+        return jsonify({"error": "output_format must be 'wav' or 'mp3'"}), 400
+    # isinstance check first: an unhashable value (e.g. a list) would make
+    # the dict membership test raise.
+    if not isinstance(voice_name, str) or voice_name not in VOICE_MAP:
+        return jsonify({
+            "error": f"Unknown voice '{voice_name}'",
+            "available": list(VOICE_MAP.keys())
+        }), 400
+    try:
+        audio_buffer = synthesize_speech(
+            text=text,
+            voice_name=voice_name,
+            speed=speed,
+            pitch_shift=pitch_shift,
+            output_format=output_format,
+        )
+    except Exception as e:
+        logger.exception("Synthesis failed")
+        return jsonify({"error": str(e)}), 500
+    elapsed = time.time() - start
+    logger.info(f"TTS: {len(text)} chars, voice={voice_name}, speed={speed}, "
+                f"pitch={pitch_shift}, format={output_format}, time={elapsed:.3f}s")
+    mimetype = "audio/wav" if output_format == "wav" else "audio/mpeg"
+    return send_file(
+        audio_buffer,
+        mimetype=mimetype,
+        as_attachment=True,
+        download_name=f"speech.{output_format}",
+    )
+@app.route("/tts/batch", methods=["POST"])
+def tts_batch():
+    """Batch TTS: join several texts and synthesize them as one WAV file.
+
+    Reads a JSON object with ``texts`` (a non-empty list of strings) and
+    optional ``voice``, ``speed`` and ``pitch_shift``. Malformed input is
+    answered with HTTP 400; a failure during synthesis with 500.
+    """
+    payload = request.get_json(force=True, silent=True)
+    data = payload if isinstance(payload, dict) else {}
+    texts = data.get("texts", [])
+    # Every item must be a string, otherwise the join below would raise.
+    if (
+        not texts
+        or not isinstance(texts, list)
+        or not all(isinstance(item, str) for item in texts)
+    ):
+        return jsonify({"error": "Provide 'texts' as a list of strings"}), 400
+    voice_name = data.get("voice", "lessac")
+    if not isinstance(voice_name, str) or voice_name not in VOICE_MAP:
+        return jsonify({
+            "error": f"Unknown voice '{voice_name}'",
+            "available": list(VOICE_MAP.keys())
+        }), 400
+    try:
+        speed = float(data.get("speed", 1.0))
+        pitch_shift = int(data.get("pitch_shift", 0))
+    except (TypeError, ValueError, OverflowError):
+        return jsonify({"error": "'speed' must be a number and 'pitch_shift' an integer"}), 400
+    # Concatenate all texts; the ". " separator makes Piper pause between them
+    combined = ". ".join(texts)
+    if not combined.strip(" ."):
+        return jsonify({"error": "No text provided"}), 400
+    # Same size cap as the /tts endpoint, applied to the joined text.
+    if len(combined) > 10000:
+        return jsonify({"error": "Text too long (max 10000 chars)"}), 400
+    try:
+        audio_buffer = synthesize_speech(
+            text=combined,
+            voice_name=voice_name,
+            speed=speed,
+            pitch_shift=pitch_shift,
+        )
+    except Exception as e:
+        logger.exception("Synthesis failed")
+        return jsonify({"error": str(e)}), 500
+    return send_file(audio_buffer, mimetype="audio/wav", as_attachment=True, download_name="batch.wav")
+if __name__ == "__main__":
+    # Load the default voice before accepting traffic so the first request
+    # does not pay the model-loading cost. A failure is logged, not fatal.
+    logger.info("Pre-loading default voice...")
+    try:
+        get_voice("lessac")
+    except Exception as e:
+        logger.error(f"Failed to pre-load voice: {e}")
+    else:
+        logger.info("Default voice ready!")
+    # Listen on all interfaces; port 5000 matches EXPOSE and the HEALTHCHECK URL.
+    app.run(host="0.0.0.0", port=5000, threaded=True)
+PYTHON_SERVER
+# Run as an unprivileged user instead of root. server.py only reads
+# /app/voices and writes temporary files through tempfile, so it needs no
+# root privileges at runtime.
+RUN useradd -m -u 1000 user
+USER user
+# Expose port (documentation only; publish it with `docker run -p 5000:5000`)
+EXPOSE 5000
+# Health check. The start period covers server.py pre-loading the default
+# voice before it begins listening.
+HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
+    CMD curl -fsS http://localhost:5000/health || exit 1
+# Run the server
+CMD ["python", "/app/server.py"]