RiishabhSinghal committed on
Commit
1fc8196
·
2 Parent(s): 21409cb6eaf50d

Merge remote main with local XTTS app

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +14 -0
  3. app.py +575 -0
  4. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,4 @@
 
1
  # XTTS Voice Clone Starter (Windows)
2
 
3
  This project gives you a fast setup to clone a voice using **Coqui XTTS v2**.
@@ -60,3 +61,16 @@ Full fine-tuning exists but is heavier (GPU VRAM, dataset, longer runs). Start w
60
  - If model download is slow/fails, retry with stable internet.
61
  - If you hit out-of-memory errors, close GPU-heavy apps or run on CPU.
62
  - If output sounds noisy, improve reference quality first.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <<<<<<< HEAD
2
  # XTTS Voice Clone Starter (Windows)
3
 
4
  This project gives you a fast setup to clone a voice using **Coqui XTTS v2**.
 
61
  - If model download is slow/fails, retry with stable internet.
62
  - If you hit out-of-memory errors, close GPU-heavy apps or run on CPU.
63
  - If output sounds noisy, improve reference quality first.
64
+ =======
65
+ ---
66
+ title: Chronis TTS
67
+ emoji: 🎙
68
+ colorFrom: gray
69
+ colorTo: gray
70
+ sdk: gradio
71
+ sdk_version: 5.23.0
72
+ app_file: app.py
73
+ pinned: false
74
+ python_version: "3.10"
75
+ ---
76
+ >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
 
3
  os.environ["PYTHONUTF8"] = "1"
4
  os.environ["PYTHONIOENCODING"] = "utf-8"
@@ -9,10 +10,17 @@ sys.stderr.reconfigure(encoding="utf-8")
9
 
10
  import re
11
  import gc
 
 
 
 
 
 
12
  import tempfile
13
  import subprocess
14
  import shutil
15
  import threading
 
16
  from pathlib import Path
17
 
18
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -85,6 +93,366 @@ def get_tts():
85
  )
86
  print("[tts] Model loaded βœ“", flush=True)
87
  return _tts_instance
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
 
90
  # ──────────────────────────────────────────────────────────────────────────────
@@ -99,6 +467,7 @@ def clean_text(text: str) -> str:
99
  return text[:500]
100
 
101
 
 
102
  def split_sentences(text: str, max_chars: int = 200) -> list[str]:
103
  """
104
  XTTS handles longer segments better than Fish Speech, so we use a
@@ -107,6 +476,12 @@ def split_sentences(text: str, max_chars: int = 200) -> list[str]:
107
  parts = re.split(r"(?<=[.!?])\s+", text)
108
  chunks: list[str] = []
109
  buf = ""
 
 
 
 
 
 
110
  for p in parts:
111
  if len(buf) + len(p) < max_chars:
112
  buf = (buf + " " + p).strip()
@@ -125,6 +500,7 @@ def split_sentences(text: str, max_chars: int = 200) -> list[str]:
125
 
126
  def prepare_ref_audio(ref_path: str) -> str:
127
  """
 
128
  Normalise reference audio to mono 24 000 Hz WAV, capped at 10 seconds.
129
 
130
  XTTS-v2 expects 24 kHz input for its speaker encoder.
@@ -225,10 +601,185 @@ def synthesize(text: str, ref_audio_path: str, secret: str):
225
  os.unlink(tmp_out)
226
  except OSError:
227
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  shutil.rmtree(workdir, ignore_errors=True)
229
 
230
 
231
  # ──────────────────────────────────────────────────────────────────────────────
 
232
  # Gradio UI (same contract as the Fish Speech version)
233
  # ──────────────────────────────────────────────────────────────────────────────
234
 
@@ -252,3 +803,27 @@ demo = gr.Interface(
252
  demo.queue()
253
  demo.launch(server_name="0.0.0.0", server_port=7860)
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ <<<<<<< HEAD
3
 
4
  os.environ["PYTHONUTF8"] = "1"
5
  os.environ["PYTHONIOENCODING"] = "utf-8"
 
10
 
11
  import re
12
  import gc
13
+ =======
14
+ import sys
15
+ import re
16
+ import gc
17
+ import base64
18
+ >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
19
  import tempfile
20
  import subprocess
21
  import shutil
22
  import threading
23
+ <<<<<<< HEAD
24
  from pathlib import Path
25
 
26
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
93
  )
94
  print("[tts] Model loaded βœ“", flush=True)
95
  return _tts_instance
96
+ =======
97
+
98
+ try:
99
+ import tomllib
100
+ except ModuleNotFoundError:
101
+ try:
102
+ import tomli as tomllib
103
+ except ModuleNotFoundError:
104
+ tomllib = None
105
+
106
+ try:
107
+ import tomli_w
108
+ except ModuleNotFoundError:
109
+ tomli_w = None
110
+
111
+ from pathlib import Path
112
+
113
+ os.environ["GRADIO_SSR_MODE"] = "0"
114
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
115
+ os.environ["OMP_NUM_THREADS"] = str(os.cpu_count() or 1)
116
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
117
+
118
+ import gradio as gr
119
+ from pydub import AudioSegment
120
+ from huggingface_hub import snapshot_download
121
+
122
+ SECRET = os.environ.get("API_SECRET", "")
123
+ REPO_DIR = Path("/tmp/fish-speech")
124
+ MODEL_DIR = Path("/tmp/fish-speech-weights")
125
+
126
+ inference_lock = threading.Lock()
127
+ initialized = False
128
+
129
+ print("=== Chronis Fish Speech Space Booting ===", flush=True)
130
+
131
+
132
+ # ──────────────────────────────────────────────────────────────────────────────
133
+ # Patch 1 — LogMelSpectrogram
134
+ #
135
+ # History of bugs fixed in this class:
136
+ #
137
+ # Round 1 — AttributeError: 'LogMelSpectrogram' has no attribute 'hop_length'
138
+ # firefly.py reads self.spec_transform.hop_length (and n_mels, n_fft, etc.)
139
+ # directly on the object. They were only stored inside self._transform.
140
+ # Fix: expose every __init__ param as a top-level self.* attribute.
141
+ #
142
+ # Round 2 (current) — RuntimeError: size of tensor a (1292) must match b (160)
143
+ # at non-singleton dimension 3
144
+ #
145
+ # Root cause A — wrong input shape → 4-D output:
146
+ # vqgan/inference.py loads audio with torchaudio.load() → (C, T),
147
+ # then passes it as (1, C, T) = (1, 1, T) to model.encode().
148
+ # firefly.encode() calls self.spec_transform(audios) with a 3-D tensor.
149
+ # T.MelSpectrogram treats every dim except the last as a batch dim,
150
+ # so (B=1, C=1, T) → output (B=1, C=1, n_mels, T_frames) [4-D].
151
+ # Downstream masks are computed as 3-D (B, 1, T_vq).
152
+ # PyTorch broadcasting aligns from the right:
153
+ # mels: (1, 1, 160, 1292) dim-3 = 1292
154
+ # mel_masks_conv: (1, 1, 1, 160) dim-3 = 160
155
+ # → "size of tensor a (1292) must match b (160) at non-singleton dim 3"
156
+ # Fix: squeeze the channel dim inside forward() so output is always 3-D.
157
+ #
158
+ # Root cause B — wrong default hyperparameters:
159
+ # The "21hz" in firefly-gan-vq-fsq-8x1024-21hz encodes the token rate:
160
+ # 44100 / (hop_length × 8_conv_strides) ≈ 21 → hop_length = 256
161
+ # n_mels is 160 for fish-speech, not 128.
162
+ # Hydra injects the correct values via __init__ kwargs, but using the
163
+ # right defaults prevents silent fallback failures.
164
+ # ──────────────────────────────────────────────────────────────────────────────
165
+ SPECTROGRAM_SRC = '''\
166
+ """
167
+ fish_speech.utils.spectrogram β€” patched by Chronis setup.
168
+ See app.py Patch 1 comment block for the full explanation of fixes.
169
+ """
170
+ import torch
171
+ import torch.nn as nn
172
+ import torchaudio.transforms as T
173
+
174
+
175
+ class LogMelSpectrogram(nn.Module):
176
+ def __init__(
177
+ self,
178
+ sample_rate: int = 44100,
179
+ n_fft: int = 1024,
180
+ hop_length: int = 256,
181
+ win_length: int = 1024,
182
+ n_mels: int = 160,
183
+ f_min: float = 0.0,
184
+ f_max: float = None,
185
+ center: bool = True,
186
+ power: float = 1.0,
187
+ norm: str = None,
188
+ mel_scale: str = "slaney",
189
+ clamp_min: float = 1e-5,
190
+ ):
191
+ super().__init__()
192
+
193
+ # Every param must be a direct instance attribute.
194
+ # firefly.py reads them as self.spec_transform.<attr>.
195
+ self.sample_rate = sample_rate
196
+ self.n_fft = n_fft
197
+ self.hop_length = hop_length
198
+ self.win_length = win_length
199
+ self.n_mels = n_mels
200
+ self.f_min = f_min
201
+ self.f_max = f_max if f_max is not None else float(sample_rate) / 2.0
202
+ self.clamp_min = clamp_min
203
+
204
+ self._transform = T.MelSpectrogram(
205
+ sample_rate = sample_rate,
206
+ n_fft = n_fft,
207
+ hop_length = hop_length,
208
+ win_length = win_length,
209
+ n_mels = n_mels,
210
+ f_min = f_min,
211
+ f_max = self.f_max,
212
+ center = center,
213
+ power = power,
214
+ norm = norm,
215
+ mel_scale = mel_scale,
216
+ )
217
+
218
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
219
+ """
220
+ x : (B, T) | (T,) | (B, 1, T) | (B, C, T)
221
+ out : (B, n_mels, T_frames) β€” always 3-D, never 4-D
222
+
223
+ The channel-squeeze is critical. vqgan/inference.py passes audio as
224
+ (B=1, C=1, T); without the squeeze T.MelSpectrogram returns a 4-D
225
+ tensor which mismatches the 3-D conv mask, crashing at dim 3.
226
+ """
227
+ if x.ndim == 3:
228
+ if x.shape[1] == 1:
229
+ x = x.squeeze(1) # mono (B, 1, T) β†’ (B, T)
230
+ else:
231
+ x = x.mean(dim=1) # stereo (B, C, T) β†’ (B, T)
232
+ mel = self._transform(x)
233
+ return torch.log(torch.clamp(mel, min=self.clamp_min))
234
+ '''
235
+
236
+
237
def _patch_spectrogram_module():
    """Write the patched ``fish_speech/utils/spectrogram.py`` into the cloned repo.

    Creates the ``utils`` package directory (plus an ``__init__.py`` when one
    is missing), overwrites ``spectrogram.py`` with ``SPECTROGRAM_SRC`` and
    deletes any cached ``spectrogram*.pyc`` next to it.
    """
    utils_dir = REPO_DIR / "fish_speech" / "utils"
    utils_dir.mkdir(parents=True, exist_ok=True)

    init_file = utils_dir / "__init__.py"
    if not init_file.exists():
        init_file.write_text("# auto-generated by Chronis setup\n", encoding="utf-8")

    # The generated file is Python source, which the interpreter decodes as
    # UTF-8 by default. SPECTROGRAM_SRC contains non-ASCII characters, so it
    # must be written as UTF-8 explicitly instead of in the locale encoding.
    spec_file = utils_dir / "spectrogram.py"
    spec_file.write_text(SPECTROGRAM_SRC, encoding="utf-8")

    # Delete any stale .pyc that could shadow the updated .py
    pyc_dir = utils_dir / "__pycache__"
    if pyc_dir.exists():
        for pyc in pyc_dir.glob("spectrogram*.pyc"):
            pyc.unlink()
            print(f"[patch] deleted stale {pyc}", flush=True)

    print(f"[patch] wrote {spec_file}", flush=True)
256
+
257
+
258
+ # ──────────────────────────────────────────────────────────────────────────────
259
+ # Patch 2 — strip pyaudio from all dependency manifests
260
+ # ──────────────────────────────────────────────────────────────────────────────
261
+ def _drop_dep(dep_list: list, pattern: str) -> list:
262
+ return [d for d in dep_list if not d.lower().startswith(pattern)]
263
+
264
+
265
def _patch_pyproject_toml():
    """Remove pyaudio from the cloned repo's ``pyproject.toml``.

    Both the ``[project].dependencies`` list and the
    ``[tool.poetry.dependencies]`` table are checked. The file is rewritten
    only when an entry was actually removed; a missing file is a no-op.

    Uses the module-level ``tomllib`` and ``tomli_w`` names, so both must be
    bound to real modules before this is called.
    """
    pyproject = REPO_DIR / "pyproject.toml"
    if not pyproject.exists():
        return

    with open(pyproject, "rb") as f:
        data = tomllib.load(f)

    changed = False
    deps = data.get("project", {}).get("dependencies", [])
    if deps:
        new_deps = _drop_dep(deps, "pyaudio")
        if new_deps != deps:
            data["project"]["dependencies"] = new_deps
            changed = True

    poetry_deps = data.get("tool", {}).get("poetry", {}).get("dependencies", {})
    # Compare keys case-insensitively so spellings other than "pyaudio" and
    # "PyAudio" are removed too. Collect the keys first: the table must not
    # be mutated while it is being iterated.
    for key in [k for k in poetry_deps if k.lower() == "pyaudio"]:
        del poetry_deps[key]
        changed = True

    if changed:
        with open(pyproject, "wb") as f:
            tomli_w.dump(data, f)
        print("[patch] removed pyaudio from pyproject.toml", flush=True)
291
+
292
+
293
+ def _patch_requirements_txt():
294
+ for fname in ("requirements.txt", "requirements-base.txt"):
295
+ req = REPO_DIR / fname
296
+ if not req.exists():
297
+ continue
298
+ lines = req.read_text().splitlines()
299
+ new_lines = [l for l in lines if not l.lower().startswith("pyaudio")]
300
+ if new_lines != lines:
301
+ req.write_text("\n".join(new_lines) + "\n")
302
+ print(f"[patch] removed pyaudio from {fname}", flush=True)
303
+
304
+
305
def _patch_setup_cfg():
    """Strip pyaudio lines from the cloned repo's ``setup.cfg``.

    The file is rewritten only when a line was really removed, and every
    remaining line keeps its own line ending. A missing file is a no-op.
    """
    setup_cfg = REPO_DIR / "setup.cfg"
    if not setup_cfg.exists():
        return

    text = setup_cfg.read_text(encoding="utf-8")
    # Keep the line endings on each line. Re-joining stripped lines with "\n"
    # drops the trailing newline, which makes the result differ from the
    # original text even when no line was removed.
    lines = text.splitlines(keepends=True)
    kept = [
        line for line in lines
        if not line.strip().lower().startswith("pyaudio")
    ]
    if len(kept) != len(lines):
        setup_cfg.write_text("".join(kept), encoding="utf-8")
        print("[patch] removed pyaudio from setup.cfg", flush=True)
316
+
317
+
318
def _patch_dependencies():
    """Remove pyaudio from every dependency manifest of the cloned repo.

    If either TOML module is unavailable (the module-level ``tomllib`` or
    ``tomli_w`` is ``None``), ``tomli`` and ``tomli_w`` are installed with pip
    and the missing names are bound before the manifests are patched.

    Raises:
        subprocess.CalledProcessError: if the pip install fails.
    """
    global tomllib, tomli_w
    if tomllib is None or tomli_w is None:
        import importlib

        subprocess.run(
            [sys.executable, "-m", "pip", "install", "tomli", "tomli_w", "-q"],
            check=True,
        )
        # The packages were installed after this interpreter started, so the
        # import system's finder caches may not know about them yet.
        importlib.invalidate_caches()

        # Bind only what is missing; an already-working parser is kept.
        if tomllib is None:
            import tomli as tomllib
        if tomli_w is None:
            import tomli_w

    _patch_pyproject_toml()
    _patch_requirements_txt()
    _patch_setup_cfg()
331
+
332
+
333
+ # ──────────────────────────────────────────────────────────────────────────────
334
+ # Patch 3 — CPU-safe subprocess wrapper
335
+ # ──────────────────────────────────────────────────────────────────────────────
336
+ WRAPPER_PATH = Path("/tmp/_chronis_torch_cpu.py")
337
+
338
+ _WRAPPER_SRC = '''\
339
+ """
340
+ Chronis CPU-safe subprocess wrapper.
341
+ Forces torch.load β†’ CPU, disables weights_only, redirects .to(cuda) β†’ .to(cpu).
342
+ Usage: python _chronis_torch_cpu.py <real_script.py> [args...]
343
+ """
344
+ import sys
345
+ import torch
346
+ import runpy
347
+
348
+ _original_load = torch.load
349
+
350
+ def _cpu_safe_load(f, map_location=None, pickle_module=None, **kwargs):
351
+ kwargs["weights_only"] = False
352
+ kwargs["map_location"] = "cpu"
353
+ if pickle_module is not None:
354
+ kwargs["pickle_module"] = pickle_module
355
+ return _original_load(f, **kwargs)
356
+
357
+ torch.load = _cpu_safe_load
358
+
359
+ _orig_module_to = torch.nn.Module.to
360
+ def _cpu_module_to(self, *args, **kwargs):
361
+ new_args = []
362
+ for a in args:
363
+ if isinstance(a, (str, torch.device)) and "cuda" in str(a):
364
+ a = torch.device("cpu")
365
+ new_args.append(a)
366
+ if "device" in kwargs and "cuda" in str(kwargs["device"]):
367
+ kwargs["device"] = torch.device("cpu")
368
+ return _orig_module_to(self, *new_args, **kwargs)
369
+ torch.nn.Module.to = _cpu_module_to
370
+
371
+ _orig_tensor_to = torch.Tensor.to
372
+ def _cpu_tensor_to(self, *args, **kwargs):
373
+ new_args = []
374
+ for a in args:
375
+ if isinstance(a, (str, torch.device)) and "cuda" in str(a):
376
+ a = torch.device("cpu")
377
+ new_args.append(a)
378
+ if "device" in kwargs and "cuda" in str(kwargs["device"]):
379
+ kwargs["device"] = torch.device("cpu")
380
+ return _orig_tensor_to(self, *new_args, **kwargs)
381
+ torch.Tensor.to = _cpu_tensor_to
382
+
383
+ sys.argv = sys.argv[1:]
384
+ runpy.run_path(sys.argv[0], run_name="__main__")
385
+ '''
386
+
387
+
388
def _patch_torch_load():
    """Write the CPU-safe subprocess wrapper script to ``WRAPPER_PATH``.

    The file content is ``_WRAPPER_SRC``; ``run_step`` later inserts this
    script's path in front of the real script in each command it runs.
    """
    # The wrapper is Python source and its docstring contains non-ASCII
    # characters, so write it as UTF-8 (what the interpreter expects for
    # source files) rather than in the locale encoding.
    WRAPPER_PATH.write_text(_WRAPPER_SRC, encoding="utf-8")
    print(f"[patch] wrote subprocess wrapper → {WRAPPER_PATH}", flush=True)
391
+
392
+
393
+ # ──────────────────────────────────────────────────────────────────────────────
394
+
395
def _build_env() -> dict:
    """Build the environment mapping passed to Fish Speech subprocesses.

    Returns a copy of ``os.environ`` with three overrides:

    * ``PYTHONPATH`` has ``REPO_DIR`` placed in front of any existing value.
    * ``HYDRA_FULL_ERROR`` is set to ``"1"``.
    * ``CUDA_VISIBLE_DEVICES`` is set to the empty string.
    """
    existing = os.environ.get("PYTHONPATH", "")
    # os.pathsep is ":" on POSIX and ";" on Windows; a hard-coded ":" yields
    # an unusable PYTHONPATH on Windows.
    new_pythonpath = (
        f"{REPO_DIR}{os.pathsep}{existing}" if existing else str(REPO_DIR)
    )
    return {
        **os.environ,
        "PYTHONPATH": new_pythonpath,
        "HYDRA_FULL_ERROR": "1",
        "CUDA_VISIBLE_DEVICES": "",
    }
404
+
405
+
406
+ # ──────────────────────────────────────────────────────────────────────────────
407
+ # Setup
408
+ # ──────────────────────────────────────────────────────────────────────────────
409
+
410
def setup():
    """Prepare the Fish Speech checkout and weights; runs at most once.

    Steps, in order:

    1. Clone the ``v1.5.0`` tag of fish-speech into ``REPO_DIR`` if that
       directory does not exist yet.
    2. Apply the spectrogram, dependency and wrapper patches.
    3. ``pip install -e .`` inside the checkout.
    4. Write the spectrogram patch a second time.
    5. Put ``REPO_DIR`` at the front of ``sys.path`` if it is not on it.
    6. Download the model weights when ``MODEL_DIR`` is missing or empty.

    The module-level ``initialized`` flag is set on success, so later calls
    return immediately. A failing subprocess raises
    ``subprocess.CalledProcessError`` and leaves the flag unset.
    """
    global initialized
    if initialized:
        return

    repo_path = str(REPO_DIR)

    if not REPO_DIR.exists():
        print("Cloning Fish Speech v1.5.0 ...", flush=True)
        clone_cmd = [
            "git", "clone",
            "--depth", "1",
            "--branch", "v1.5.0",
            "https://github.com/fishaudio/fish-speech.git",
            repo_path,
        ]
        subprocess.run(clone_cmd, check=True)

    for apply_patch in (
        _patch_spectrogram_module,
        _patch_dependencies,
        _patch_torch_load,
    ):
        apply_patch()

    print("Installing Fish Speech (editable) ...", flush=True)
    install_cmd = [sys.executable, "-m", "pip", "install", "-e", ".", "--quiet"]
    subprocess.run(install_cmd, cwd=repo_path, check=True)

    # Re-apply AFTER pip install — the original author notes that an editable
    # install can leave stale .pyc files behind.
    _patch_spectrogram_module()

    if repo_path not in sys.path:
        sys.path.insert(0, repo_path)

    weights_present = MODEL_DIR.exists() and any(MODEL_DIR.iterdir())
    if not weights_present:
        print("Downloading Fish Speech 1.5 weights ...", flush=True)
        snapshot_download(
            repo_id="fishaudio/fish-speech-1.5",
            local_dir=str(MODEL_DIR),
            local_dir_use_symlinks=False,
        )

    print("Setup complete.", flush=True)
    initialized = True
455
+ >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
456
 
457
 
458
  # ──────────────────────────────────────────────────────────────────────────────
 
467
  return text[:500]
468
 
469
 
470
+ <<<<<<< HEAD
471
  def split_sentences(text: str, max_chars: int = 200) -> list[str]:
472
  """
473
  XTTS handles longer segments better than Fish Speech, so we use a
 
476
  parts = re.split(r"(?<=[.!?])\s+", text)
477
  chunks: list[str] = []
478
  buf = ""
479
+ =======
480
+ def split_sentences(text: str, max_chars: int = 120) -> list:
481
+ parts = re.split(r"(?<=[.!?])\s+", text)
482
+ chunks = []
483
+ buf = ""
484
+ >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
485
  for p in parts:
486
  if len(buf) + len(p) < max_chars:
487
  buf = (buf + " " + p).strip()
 
500
 
501
  def prepare_ref_audio(ref_path: str) -> str:
502
  """
503
+ <<<<<<< HEAD
504
  Normalise reference audio to mono 24 000 Hz WAV, capped at 10 seconds.
505
 
506
  XTTS-v2 expects 24 kHz input for its speaker encoder.
 
601
  os.unlink(tmp_out)
602
  except OSError:
603
  pass
604
+ =======
605
+ Normalise to mono 44100 Hz WAV, capped at 8 seconds.
606
+
607
+ Fish Speech docs recommend 3-10 s of reference. We cap at 8 s:
608
+ - Short enough to keep CPU encode time reasonable
609
+ - Long enough for good speaker characterisation
610
+ - Avoids edge-case rounding in the conv-mask stride at 15 s lengths
611
+ """
612
+ audio = AudioSegment.from_file(ref_path)
613
+ audio = audio.set_channels(1).set_frame_rate(44100).normalize()
614
+
615
+ if len(audio) > 8_000:
616
+ audio = audio[:8_000]
617
+ elif len(audio) < 1_000:
618
+ raise ValueError(
619
+ f"Reference audio too short ({len(audio)}ms). Need at least 1 second."
620
+ )
621
+
622
+ tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
623
+ audio.export(tmp.name, format="wav")
624
+ return tmp.name
625
+
626
+
627
+ # ──────────────────────────────────────────────────────────────────────────────
628
+ # Inference pipeline
629
+ # ──────────────────────────────────────────────────────────────────────────────
630
+
631
def run_step(cmd: list, name: str, cwd: Path, expect_output: Path = None):
    """
    Run a Fish Speech subprocess through the CPU wrapper.

    ``cmd[0]`` is kept as the interpreter and ``WRAPPER_PATH`` is inserted
    directly after it, so the wrapper receives the real script and its
    arguments. The child runs in ``cwd`` with the environment from
    ``_build_env()`` and a 600 second timeout.

    Args:
        cmd: Command as an argument list; the first item is the interpreter.
        name: Label used in log lines and error messages.
        cwd: Working directory for the subprocess.
        expect_output: Optional path that must exist after a successful run.

    Raises:
        RuntimeError: on timeout, non-zero exit, or missing expected output.
    """
    print(f"[{name}] starting ...", flush=True)
    wrapped_cmd = [cmd[0], str(WRAPPER_PATH)] + cmd[1:]

    try:
        result = subprocess.run(
            wrapped_cmd,
            cwd=str(cwd),
            capture_output=True,
            text=True,
            env=_build_env(),
            timeout=600,
        )
    except subprocess.TimeoutExpired as exc:
        # Report a timeout in the same shape as the other failures instead of
        # letting the raw TimeoutExpired (whose message embeds the entire
        # command line) reach the caller.
        diag = f"[{name}] FAILED (timed out after {exc.timeout} s)"
        print(diag, flush=True)
        raise RuntimeError(diag) from exc

    if result.stdout.strip():
        print(f"[{name}] stdout:\n{result.stdout[-1200:]}", flush=True)

    if result.returncode != 0:
        # Only the tail of each stream is kept so the message stays readable.
        diag = (
            f"[{name}] FAILED (exit {result.returncode})\n"
            f"--- stderr ---\n{result.stderr[-1500:]}\n"
            f"--- stdout ---\n{result.stdout[-600:]}"
        )
        print(diag, flush=True)
        raise RuntimeError(diag)

    if expect_output is not None and not expect_output.exists():
        raise RuntimeError(
            f"[{name}] exited 0 but expected output missing: {expect_output}\n"
            f"stdout: {result.stdout[-800:]}\nstderr: {result.stderr[-800:]}"
        )

    print(f"[{name}] done ✓", flush=True)
667
+
668
+
669
def run_chunk(text: str, ref_audio: str, workdir: Path, idx: int) -> str:
    """Synthesise one text chunk in the reference voice and return the WAV path.

    Works inside ``workdir/chunk_<idx>``: the reference audio is copied there
    as ``ref.wav`` and three subprocess steps are run through ``run_step``
    (codec encode, text-to-semantic, codec decode). Each step must produce
    its expected file (``fake.npy``, ``codes_0.npy``, ``fake.wav``).

    Returns:
        Path of the generated ``fake.wav`` as a string.
    """
    chunk_dir = workdir / f"chunk_{idx}"
    chunk_dir.mkdir(parents=True, exist_ok=True)

    ref_copy = chunk_dir / "ref.wav"
    shutil.copy(ref_audio, ref_copy)

    prompt_npy = chunk_dir / "fake.npy"
    codes_npy = chunk_dir / "codes_0.npy"
    result_wav = chunk_dir / "fake.wav"

    # In fish-speech v1.5, tools/vqgan/inference.py handles BOTH encode and
    # decode. Mode is auto-detected from the input file extension:
    #   .wav → encode → writes fake.npy
    #   .npy → decode → writes fake.wav
    vqgan_script = str(REPO_DIR / "tools" / "vqgan" / "inference.py")
    t2s_script = str(
        REPO_DIR / "fish_speech" / "models" / "text2semantic" / "inference.py"
    )
    firefly_ckpt = str(MODEL_DIR / "firefly-gan-vq-fsq-8x1024-21hz-generator.pth")

    def codec_cmd(input_path: Path) -> list:
        # Same command line for encode and decode; only the input differs.
        return [
            sys.executable, vqgan_script,
            "-i", str(input_path),
            "--checkpoint-path", firefly_ckpt,
            "--device", "cpu",
        ]

    # Step 1: Reference audio → VQ tokens
    run_step(
        codec_cmd(ref_copy),
        name="Codec Encode",
        cwd=chunk_dir,
        expect_output=prompt_npy,
    )

    # Step 2: Text + VQ tokens → semantic codes
    t2s_cmd = [
        sys.executable, t2s_script,
        "--text", text,
        "--prompt-tokens", str(prompt_npy),
        "--checkpoint-path", str(MODEL_DIR),
        "--num-samples", "1",
        "--device", "cpu",
    ]
    run_step(
        t2s_cmd,
        name="Text2Semantic",
        cwd=chunk_dir,
        expect_output=codes_npy,
    )

    # Step 3: Semantic codes → audio
    run_step(
        codec_cmd(codes_npy),
        name="Codec Decode",
        cwd=chunk_dir,
        expect_output=result_wav,
    )

    return str(result_wav)
730
+
731
+
732
+ # ──────────────────────────────────────────────────────────────────────────────
733
+ # Main synthesis entry point
734
+ # ──────────────────────────────────────────────────────────────────────────────
735
+
736
+ def synthesize(text: str, ref_audio_path: str, secret: str):
737
+ with inference_lock:
738
+ if SECRET and secret != SECRET:
739
+ return "", "Unauthorized"
740
+
741
+ if not ref_audio_path or not Path(ref_audio_path).exists():
742
+ return "", "Reference audio missing or not uploaded"
743
+
744
+ try:
745
+ setup()
746
+ except Exception as e:
747
+ return "", f"Setup failed: {e}"
748
+
749
+ cleaned = clean_text(text)
750
+ chunks = split_sentences(cleaned)
751
+ workdir = Path(tempfile.mkdtemp(prefix="chronis_tts_"))
752
+
753
+ try:
754
+ clean_ref = prepare_ref_audio(ref_audio_path)
755
+ combined = AudioSegment.empty()
756
+
757
+ for i, chunk in enumerate(chunks):
758
+ print(f"[synth] chunk {i+1}/{len(chunks)}: {chunk[:80]!r}", flush=True)
759
+ out = run_chunk(chunk, clean_ref, workdir, i)
760
+ combined += AudioSegment.from_wav(out)
761
+ gc.collect()
762
+
763
+ tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
764
+ combined.export(tmp.name, format="wav")
765
+
766
+ with open(tmp.name, "rb") as f:
767
+ audio_b64 = base64.b64encode(f.read()).decode()
768
+
769
+ os.unlink(tmp.name)
770
+ return audio_b64, "ok"
771
+
772
+ except Exception as e:
773
+ print(f"[synth] ERROR: {e}", flush=True)
774
+ return "", str(e)
775
+
776
+ finally:
777
+ >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
778
  shutil.rmtree(workdir, ignore_errors=True)
779
 
780
 
781
  # ──────────────────────────────────────────────────────────────────────────────
782
+ <<<<<<< HEAD
783
  # Gradio UI (same contract as the Fish Speech version)
784
  # ──────────────────────────────────────────────────────────────────────────────
785
 
 
803
  demo.queue()
804
  demo.launch(server_name="0.0.0.0", server_port=7860)
805
 
806
+ =======
807
+ # Gradio UI
808
+ # ──────────────────────────────────────────────────────────────────────────────
809
+
810
# Gradio front end: three inputs (text, reference audio file path, secret)
# are passed to synthesize(), whose two return values fill the two output
# text boxes (base64 audio and status).
demo = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Text to synthesise"),
        gr.Audio(type="filepath", label="Reference Voice (3–8 second voice note)"),
        gr.Textbox(label="Secret", type="password"),
    ],
    outputs=[
        gr.Textbox(label="Audio Base64"),
        gr.Textbox(label="Status"),
    ],
    api_name="predict",
    title="Chronis Fish Speech",
    # The em dash was mis-encoded in the user-facing description; restored.
    description="Voice cloning TTS — send a voice note, get the cloned voice back.",
    flagging_mode="never",
)

demo.queue()
demo.launch(server_name="0.0.0.0", server_port=7860)
829
+ >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
requirements.txt CHANGED
@@ -1,9 +1,12 @@
 
1
  TTS>=0.22.0
2
  torch
3
  torchaudio
4
  soundfile
5
  librosa
6
  tqdm
 
 
7
  --extra-index-url https://download.pytorch.org/whl/cpu
8
  gradio==5.23.0
9
  torch==2.1.0+cpu
 
1
+ <<<<<<< HEAD
2
  TTS>=0.22.0
3
  torch
4
  torchaudio
5
  soundfile
6
  librosa
7
  tqdm
8
+ =======
9
+ >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
10
  --extra-index-url https://download.pytorch.org/whl/cpu
11
  gradio==5.23.0
12
  torch==2.1.0+cpu