Spaces:

CherithCutestory
/

styletts2

Paused

App Files Community

CherithCutestory committed on Feb 19

Commit

c615db3

1 Parent(s): d6d700f

Switched to a docker-based setup

Browse files

Files changed (6) hide show

Dockerfile +51 -0
README.md +7 -4
app.py +64 -171
packages.txt +0 -4
postBuild +0 -3
requirements.txt +0 -11

Dockerfile ADDED Viewed

	@@ -0,0 +1,51 @@

+# ---------- base with CUDA runtime for T4 ----------
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860
+# ---------- system packages ----------
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 python3-dev python3-pip python3-venv \
+    espeak-ng \
+    build-essential \
+    libsndfile1 \
+    ffmpeg \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# ---------- Python build tools ----------
+RUN pip3 install --no-cache-dir --upgrade \
+    pip setuptools wheel Cython numpy
+# ---------- PyTorch (CUDA 11.8, matches the base image) ----------
+RUN pip3 install --no-cache-dir \
+    torch==2.1.0 torchaudio==2.1.0 \
+    --index-url https://download.pytorch.org/whl/cu118
+# ---------- StyleTTS2 + Gradio ----------
+RUN pip3 install --no-cache-dir styletts2 gradio
+# ---------- NLTK data (StyleTTS2 uses punkt for sentence splitting) ---
+RUN python3 -c "\
+import nltk, os;\
+os.makedirs('/usr/share/nltk_data', exist_ok=True);\
+nltk.download('punkt',     download_dir='/usr/share/nltk_data');\
+nltk.download('punkt_tab', download_dir='/usr/share/nltk_data');\
+"
+# ---------- non-root user (HF Spaces requirement) ----------
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH="/home/user/.local/bin:${PATH}" \
+    NLTK_DATA=/usr/share/nltk_data
+WORKDIR /home/user/app
+COPY --chown=user:user . .
+EXPOSE 7860
+CMD ["python3", "app.py"]

README.md CHANGED Viewed

@@ -1,6 +1,9 @@
 ---
-title: VoxLibris - StyleTTS2
-sdk: gradio
-python_version: "3.11"
-app_file: app.py
 ---

 ---
+title: StyleTTS2 Test
+emoji: 🔊
+colorFrom: blue
+colorTo: purple
+sdk: docker
+app_port: 7860
+pinned: false
 ---

app.py CHANGED Viewed

@@ -1,184 +1,77 @@
-import os, sys, subprocess
-# ---- Fix OpenMP env issue on HF ----
-val = os.environ.get("OMP_NUM_THREADS", "1")
-try:
-    val = str(int(val))
-except Exception:
-    val = "1"
-os.environ["OMP_NUM_THREADS"] = val
-def ensure_styletts2():
-    try:
-        import styletts2  # noqa: F401
-        return
-    except ModuleNotFoundError:
-        pass
-    subprocess.check_call([
-        sys.executable, "-m", "pip", "install", "--upgrade", "--no-cache-dir", "--no-deps", "styletts2==0.1.6"
-    ])
-def import_styletts2_class():
-    """
-    styletts2 PyPI package doesn't export StyleTTS2 at top-level.
-    Try a few known module locations and return the class/callable.
-    """
-    import importlib
-    # Try common locations seen in forks / packaged builds
-    candidates = [
-        ("styletts2", "StyleTTS2"),
-        ("styletts2.model", "StyleTTS2"),
-        ("styletts2.styletts2", "StyleTTS2"),
-        ("styletts2.api", "StyleTTS2"),
     ]
-    for mod_name, attr in candidates:
         try:
             mod = importlib.import_module(mod_name)
-            if hasattr(mod, attr):
-                return getattr(mod, attr)
-        except Exception:
-            pass
-    # If none worked, print what's actually inside and fail loudly
-    import styletts2
-    raise ImportError(
-        "Could not locate StyleTTS2 class. "
-        f"styletts2 package loaded from: {getattr(styletts2, '__file__', 'unknown')}. "
-        f"Available attrs: {sorted([a for a in dir(styletts2) if not a.startswith('_')])}"
-    )
-ensure_styletts2()
-import io
-import uuid
-import soundfile as sf
-import gradio as gr
-import torch
-StyleTTS2 = import_styletts2_class()
-# ---------------------------
-# Global config
-# ---------------------------
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print("✅ VoxLibris StyleTTS2 Space starting...")
-print("Device:", DEVICE)
-# ---------------------------
-# Load model
-# ---------------------------
-def load_model():
-    print("Loading StyleTTS2 model...")
-    model = StyleTTS2(device=DEVICE)
-    return model
-tts_model = load_model()
-# ---------------------------
-# TTS core function
-# ---------------------------
-def tts_generate(
-    text: str,
-    speaker_wav=None,
-    speaker_transcript: str = "",
-    speed: float = 1.0,
-    pitch: float = 0.0,
-    emotion: str = "neutral",
-    seed: int = 0,
-):
-    """
-    VoxLibris-compatible TTS API.
-    Parameters:
-      - text: required
-      - speaker_wav: optional reference audio (voice cloning)
-      - speaker_transcript: ignored (StyleTTS2 does not need it)
-      - speed/pitch/emotion: accepted but mostly ignored
-      - seed: supported for reproducibility
-    """
-    if not text or len(text.strip()) == 0:
-        raise ValueError("Text cannot be empty.")
-    if seed:
-        torch.manual_seed(seed)
-    print("Generating:", text[:80])
-    # StyleTTS2 voice cloning support
-    ref_audio_path = None
-    if speaker_wav is not None:
-        ref_audio_path = speaker_wav
-    # Generate waveform
-    wav, sr = tts_model.infer(
-        text=text,
-        ref_audio_path=ref_audio_path,
-    )
-    # Write MP3-like output as WAV (Gradio supports direct playback)
-    tmp_name = f"/tmp/{uuid.uuid4().hex}.wav"
-    sf.write(tmp_name, wav, sr)
-    return tmp_name
-# ---------------------------
-# Gradio UI + API
-# ---------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# 📖 VoxLibris — StyleTTS2 API")
-    inp_text = gr.Textbox(label="Text", lines=4)
-    inp_voice = gr.Audio(
-        label="Reference Voice WAV (optional)",
-        type="filepath"
     )
-    inp_transcript = gr.Textbox(
-        label="Voice Transcript (optional, ignored)",
-        value=""
-    )
-    inp_speed = gr.Slider(0.5, 1.5, value=1.0, label="Speed")
-    inp_pitch = gr.Slider(-5, 5, value=0.0, label="Pitch (ignored)")
-    inp_emotion = gr.Dropdown(
-        ["neutral", "happy", "sad", "angry"],
-        value="neutral",
-        label="Emotion (ignored)",
-    )
-    inp_seed = gr.Number(value=0, label="Seed")
-    out_audio = gr.Audio(label="Output Audio")
-    btn = gr.Button("Generate")
-    btn.click(
-        fn=tts_generate,
-        inputs=[
-            inp_text,
-            inp_voice,
-            inp_transcript,
-            inp_speed,
-            inp_pitch,
-            inp_emotion,
-            inp_seed,
-        ],
-        outputs=out_audio,
-        api_name="tts",   # ✅ Consistent endpoint name
-    )
 demo.launch()

+"""
+Phase 1 – import & environment test.
+If every line shows ✅ you are ready for Phase 2.
+"""
+import importlib
+import subprocess
+import gradio as gr
+def run_diagnostics() -> str:
+    lines: list[str] = []
+    # ---- Python package imports ----
+    pkgs = [
+        ("torch",          "PyTorch"),
+        ("torchaudio",     "torchaudio"),
+        ("phonemizer",     "phonemizer"),
+        ("munch",          "munch"),
+        ("nltk",           "NLTK"),
+        ("styletts2",      "styletts2 (package)"),
+        ("styletts2.tts",  "styletts2.tts (TTS class)"),
     ]
+    for mod_name, label in pkgs:
         try:
             mod = importlib.import_module(mod_name)
+            ver = getattr(mod, "__version__", "n/a")
+            lines.append(f"✅  {label:30s}  version {ver}")
+        except Exception as exc:
+            lines.append(f"❌  {label:30s}  {exc}")
+    # ---- CUDA ----
+    try:
+        import torch
+        if torch.cuda.is_available():
+            name = torch.cuda.get_device_name(0)
+            lines.append(f"✅  CUDA device                     {name}")
+        else:
+            lines.append("⚠️  CUDA not available (CPU-only)")
+    except Exception as exc:
+        lines.append(f"❌  CUDA check failed               {exc}")
+    # ---- espeak-ng binary ----
+    try:
+        r = subprocess.run(
+            ["espeak-ng", "--version"],
+            capture_output=True, text=True, timeout=5,
+        )
+        lines.append(f"✅  espeak-ng                        {r.stdout.strip()}")
+    except FileNotFoundError:
+        lines.append("❌  espeak-ng                        binary not found")
+    except Exception as exc:
+        lines.append(f"❌  espeak-ng                        {exc}")
+    # ---- Quick model instantiation test ----
+    try:
+        from styletts2 import tts as stts
+        _engine = stts.StyleTTS2()          # downloads weights on first run
+        lines.append("✅  StyleTTS2 model loaded OK")
+        del _engine
+    except Exception as exc:
+        lines.append(f"❌  StyleTTS2 model load failed      {exc}")
+    return "\n".join(lines)
 with gr.Blocks() as demo:
+    gr.Markdown("## StyleTTS2 — Environment Diagnostics")
+    output = gr.Textbox(
+        label="Results",
+        lines=18,
+        interactive=False,
     )
+    btn = gr.Button("Run diagnostics")
+    btn.click(fn=run_diagnostics, outputs=output)
+    demo.load(fn=run_diagnostics, outputs=output)   # auto-run on page load
 demo.launch()

packages.txt DELETED Viewed

@@ -1,4 +0,0 @@
-ffmpeg
-libsndfile1
-espeak-ng
-libespeak-ng1

postBuild DELETED Viewed

@@ -1,3 +0,0 @@
-#!/usr/bin/env bash
-set -euxo pipefail
-pip install --no-cache-dir --no-deps styletts2==0.1.6

requirements.txt DELETED Viewed

@@ -1,11 +0,0 @@
-gradio==6.6.0
-torch
-torchaudio
-numpy<2.0
-scipy
-soundfile
-ffmpeg-python
-cached-path
-huggingface-hub>=0.33.5,<2.0