CherithCutestory committed on
Commit
c615db3
·
1 Parent(s): d6d700f

Switched to a docker-based setup

Browse files
Files changed (6) hide show
  1. Dockerfile +51 -0
  2. README.md +7 -4
  3. app.py +64 -171
  4. packages.txt +0 -4
  5. postBuild +0 -3
  6. requirements.txt +0 -11
Dockerfile ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---------- base with CUDA runtime for T4 ----------
2
+ FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
3
+
4
+ ENV DEBIAN_FRONTEND=noninteractive \
5
+ PYTHONUNBUFFERED=1 \
6
+ GRADIO_SERVER_NAME=0.0.0.0 \
7
+ GRADIO_SERVER_PORT=7860
8
+
9
+ # ---------- system packages ----------
10
+ RUN apt-get update && apt-get install -y --no-install-recommends \
11
+ python3 python3-dev python3-pip python3-venv \
12
+ espeak-ng \
13
+ build-essential \
14
+ libsndfile1 \
15
+ ffmpeg \
16
+ git \
17
+ && rm -rf /var/lib/apt/lists/*
18
+
19
+ # ---------- Python build tools ----------
20
+ RUN pip3 install --no-cache-dir --upgrade \
21
+ pip setuptools wheel Cython numpy
22
+
23
+ # ---------- PyTorch (CUDA 11.8, matches the base image) ----------
24
+ RUN pip3 install --no-cache-dir \
25
+ torch==2.1.0 torchaudio==2.1.0 \
26
+ --index-url https://download.pytorch.org/whl/cu118
27
+
28
+ # ---------- StyleTTS2 + Gradio ----------
29
+ RUN pip3 install --no-cache-dir styletts2 gradio
30
+
31
+ # ---------- NLTK data (StyleTTS2 uses punkt for sentence splitting) ---
32
+ RUN python3 -c "\
33
+ import nltk, os;\
34
+ os.makedirs('/usr/share/nltk_data', exist_ok=True);\
35
+ nltk.download('punkt', download_dir='/usr/share/nltk_data');\
36
+ nltk.download('punkt_tab', download_dir='/usr/share/nltk_data');\
37
+ "
38
+
39
+ # ---------- non-root user (HF Spaces requirement) ----------
40
+ RUN useradd -m -u 1000 user
41
+ USER user
42
+ ENV HOME=/home/user \
43
+ PATH="/home/user/.local/bin:${PATH}" \
44
+ NLTK_DATA=/usr/share/nltk_data
45
+
46
+ WORKDIR /home/user/app
47
+ COPY --chown=user:user . .
48
+
49
+ EXPOSE 7860
50
+ CMD ["python3", "app.py"]
51
+
README.md CHANGED
@@ -1,6 +1,9 @@
1
  ---
2
- title: VoxLibris - StyleTTS2
3
- sdk: gradio
4
- python_version: "3.11"
5
- app_file: app.py
 
 
 
6
  ---
 
1
  ---
2
+ title: StyleTTS2 Test
3
+ emoji: 🔊
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
  ---
app.py CHANGED
@@ -1,184 +1,77 @@
1
- import os, sys, subprocess
 
 
 
2
 
3
- # ---- Fix OpenMP env issue on HF ----
4
- val = os.environ.get("OMP_NUM_THREADS", "1")
5
- try:
6
- val = str(int(val))
7
- except Exception:
8
- val = "1"
9
- os.environ["OMP_NUM_THREADS"] = val
10
-
11
- def ensure_styletts2():
12
- try:
13
- import styletts2 # noqa: F401
14
- return
15
- except ModuleNotFoundError:
16
- pass
17
-
18
- subprocess.check_call([
19
- sys.executable, "-m", "pip", "install", "--upgrade", "--no-cache-dir", "--no-deps", "styletts2==0.1.6"
20
- ])
21
-
22
- def import_styletts2_class():
23
- """
24
- styletts2 PyPI package doesn't export StyleTTS2 at top-level.
25
- Try a few known module locations and return the class/callable.
26
- """
27
- import importlib
28
 
29
- # Try common locations seen in forks / packaged builds
30
- candidates = [
31
- ("styletts2", "StyleTTS2"),
32
- ("styletts2.model", "StyleTTS2"),
33
- ("styletts2.styletts2", "StyleTTS2"),
34
- ("styletts2.api", "StyleTTS2"),
 
 
 
 
 
 
35
  ]
36
-
37
- for mod_name, attr in candidates:
38
  try:
39
  mod = importlib.import_module(mod_name)
40
- if hasattr(mod, attr):
41
- return getattr(mod, attr)
42
- except Exception:
43
- pass
44
-
45
- # If none worked, print what's actually inside and fail loudly
46
- import styletts2
47
- raise ImportError(
48
- "Could not locate StyleTTS2 class. "
49
- f"styletts2 package loaded from: {getattr(styletts2, '__file__', 'unknown')}. "
50
- f"Available attrs: {sorted([a for a in dir(styletts2) if not a.startswith('_')])}"
51
- )
52
-
53
- ensure_styletts2()
54
-
55
- import io
56
- import uuid
57
- import soundfile as sf
58
- import gradio as gr
59
- import torch
60
-
61
- StyleTTS2 = import_styletts2_class()
62
-
63
- # ---------------------------
64
- # Global config
65
- # ---------------------------
66
-
67
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
68
-
69
- print("βœ… VoxLibris StyleTTS2 Space starting...")
70
- print("Device:", DEVICE)
71
-
72
-
73
- # ---------------------------
74
- # Load model
75
- # ---------------------------
76
-
77
- def load_model():
78
- print("Loading StyleTTS2 model...")
79
- model = StyleTTS2(device=DEVICE)
80
- return model
81
-
82
-
83
- tts_model = load_model()
84
-
85
-
86
- # ---------------------------
87
- # TTS core function
88
- # ---------------------------
89
-
90
- def tts_generate(
91
- text: str,
92
- speaker_wav=None,
93
- speaker_transcript: str = "",
94
- speed: float = 1.0,
95
- pitch: float = 0.0,
96
- emotion: str = "neutral",
97
- seed: int = 0,
98
- ):
99
- """
100
- VoxLibris-compatible TTS API.
101
-
102
- Parameters:
103
- - text: required
104
- - speaker_wav: optional reference audio (voice cloning)
105
- - speaker_transcript: ignored (StyleTTS2 does not need it)
106
- - speed/pitch/emotion: accepted but mostly ignored
107
- - seed: supported for reproducibility
108
- """
109
-
110
- if not text or len(text.strip()) == 0:
111
- raise ValueError("Text cannot be empty.")
112
-
113
- if seed:
114
- torch.manual_seed(seed)
115
-
116
- print("Generating:", text[:80])
117
-
118
- # StyleTTS2 voice cloning support
119
- ref_audio_path = None
120
- if speaker_wav is not None:
121
- ref_audio_path = speaker_wav
122
-
123
- # Generate waveform
124
- wav, sr = tts_model.infer(
125
- text=text,
126
- ref_audio_path=ref_audio_path,
127
- )
128
-
129
- # Write MP3-like output as WAV (Gradio supports direct playback)
130
- tmp_name = f"/tmp/{uuid.uuid4().hex}.wav"
131
- sf.write(tmp_name, wav, sr)
132
-
133
- return tmp_name
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
- # ---------------------------
137
- # Gradio UI + API
138
- # ---------------------------
139
 
140
  with gr.Blocks() as demo:
141
- gr.Markdown("# πŸ“– VoxLibris β€” StyleTTS2 API")
142
-
143
- inp_text = gr.Textbox(label="Text", lines=4)
144
-
145
- inp_voice = gr.Audio(
146
- label="Reference Voice WAV (optional)",
147
- type="filepath"
148
  )
 
 
149
 
150
- inp_transcript = gr.Textbox(
151
- label="Voice Transcript (optional, ignored)",
152
- value=""
153
- )
154
-
155
- inp_speed = gr.Slider(0.5, 1.5, value=1.0, label="Speed")
156
- inp_pitch = gr.Slider(-5, 5, value=0.0, label="Pitch (ignored)")
157
- inp_emotion = gr.Dropdown(
158
- ["neutral", "happy", "sad", "angry"],
159
- value="neutral",
160
- label="Emotion (ignored)",
161
- )
162
-
163
- inp_seed = gr.Number(value=0, label="Seed")
164
-
165
- out_audio = gr.Audio(label="Output Audio")
166
-
167
- btn = gr.Button("Generate")
168
-
169
- btn.click(
170
- fn=tts_generate,
171
- inputs=[
172
- inp_text,
173
- inp_voice,
174
- inp_transcript,
175
- inp_speed,
176
- inp_pitch,
177
- inp_emotion,
178
- inp_seed,
179
- ],
180
- outputs=out_audio,
181
- api_name="tts", # βœ… Consistent endpoint name
182
- )
183
 
184
  demo.launch()
 
1
+ """
2
+ Phase 1 – import & environment test.
3
+ If every line shows ✅ you are ready for Phase 2.
4
+ """
5
 
6
+ import importlib
7
+ import subprocess
8
+ import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ def run_diagnostics() -> str:
11
+ lines: list[str] = []
12
+
13
+ # ---- Python package imports ----
14
+ pkgs = [
15
+ ("torch", "PyTorch"),
16
+ ("torchaudio", "torchaudio"),
17
+ ("phonemizer", "phonemizer"),
18
+ ("munch", "munch"),
19
+ ("nltk", "NLTK"),
20
+ ("styletts2", "styletts2 (package)"),
21
+ ("styletts2.tts", "styletts2.tts (TTS class)"),
22
  ]
23
+ for mod_name, label in pkgs:
 
24
  try:
25
  mod = importlib.import_module(mod_name)
26
+ ver = getattr(mod, "__version__", "n/a")
27
+ lines.append(f"βœ… {label:30s} version {ver}")
28
+ except Exception as exc:
29
+ lines.append(f"❌ {label:30s} {exc}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # ---- CUDA ----
32
+ try:
33
+ import torch
34
+ if torch.cuda.is_available():
35
+ name = torch.cuda.get_device_name(0)
36
+ lines.append(f"βœ… CUDA device {name}")
37
+ else:
38
+ lines.append("⚠️ CUDA not available (CPU-only)")
39
+ except Exception as exc:
40
+ lines.append(f"❌ CUDA check failed {exc}")
41
+
42
+ # ---- espeak-ng binary ----
43
+ try:
44
+ r = subprocess.run(
45
+ ["espeak-ng", "--version"],
46
+ capture_output=True, text=True, timeout=5,
47
+ )
48
+ lines.append(f"βœ… espeak-ng {r.stdout.strip()}")
49
+ except FileNotFoundError:
50
+ lines.append("❌ espeak-ng binary not found")
51
+ except Exception as exc:
52
+ lines.append(f"❌ espeak-ng {exc}")
53
+
54
+ # ---- Quick model instantiation test ----
55
+ try:
56
+ from styletts2 import tts as stts
57
+ _engine = stts.StyleTTS2() # downloads weights on first run
58
+ lines.append("βœ… StyleTTS2 model loaded OK")
59
+ del _engine
60
+ except Exception as exc:
61
+ lines.append(f"❌ StyleTTS2 model load failed {exc}")
62
 
63
+ return "\n".join(lines)
 
 
64
 
65
  with gr.Blocks() as demo:
66
+ gr.Markdown("## StyleTTS2 β€” Environment Diagnostics")
67
+ output = gr.Textbox(
68
+ label="Results",
69
+ lines=18,
70
+ interactive=False,
 
 
71
  )
72
+ btn = gr.Button("Run diagnostics")
73
+ btn.click(fn=run_diagnostics, outputs=output)
74
 
75
+ demo.load(fn=run_diagnostics, outputs=output) # auto-run on page load
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  demo.launch()
packages.txt DELETED
@@ -1,4 +0,0 @@
1
- ffmpeg
2
- libsndfile1
3
- espeak-ng
4
- libespeak-ng1
 
 
 
 
 
postBuild DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -euxo pipefail
3
- pip install --no-cache-dir --no-deps styletts2==0.1.6
 
 
 
 
requirements.txt DELETED
@@ -1,11 +0,0 @@
1
- gradio==6.6.0
2
-
3
- torch
4
- torchaudio
5
- numpy<2.0
6
- scipy
7
- soundfile
8
- ffmpeg-python
9
- cached-path
10
-
11
- huggingface-hub>=0.33.5,<2.0