RiishabhSinghal committed on
Commit
51e47e1
·
1 Parent(s): e082b9f

Resolve app.py merge conflicts

Browse files
Files changed (1) hide show
  1. app.py +14 -688
app.py CHANGED
@@ -1,9 +1,4 @@
1
  import os
2
- <<<<<<< HEAD
3
- os.environ["PYTHONUTF8"] = "1"
4
- os.environ["PYTHONIOENCODING"] = "utf-8"
5
- =======
6
- <<<<<<< HEAD
7
 
8
  os.environ["PYTHONUTF8"] = "1"
9
  os.environ["PYTHONIOENCODING"] = "utf-8"
@@ -14,20 +9,10 @@ sys.stderr.reconfigure(encoding="utf-8")
14
 
15
  import re
16
  import gc
17
- =======
18
- >>>>>>> main
19
- import sys
20
- sys.stdout.reconfigure(encoding='utf-8')
21
- sys.stderr.reconfigure(encoding='utf-8')
22
- import re
23
- import gc
24
- import base64
25
- >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
26
  import tempfile
27
  import subprocess
28
  import shutil
29
  import threading
30
- <<<<<<< HEAD
31
  from pathlib import Path
32
 
33
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -46,33 +31,22 @@ _tts_instance = None # lazy-loaded TTS object
46
  print("=== Chronis XTTS-v2 Space Booting ===", flush=True)
47
 
48
 
49
- # ──────────────────────────────────────────────────────────────────────────────
50
- # Setup — install TTS library and download XTTS-v2 weights on first run
51
- # ──────────────────────────────────────────────────────────────────────────────
52
-
53
  def setup():
54
- """
55
- Installs the Coqui TTS library if absent, then downloads XTTS-v2 weights
56
- to MODEL_DIR (skipped when weights are already present).
57
- """
58
- # 1. Make sure the TTS package is available
59
  try:
60
  import TTS # noqa: F401
61
  print("[setup] TTS library already installed.", flush=True)
62
  except ImportError:
63
  print("[setup] Installing TTS library ...", flush=True)
64
- subprocess.run(
65
- [sys.executable, "-m", "pip", "install", "TTS", "-q"],
66
- check=True,
67
- )
68
  print("[setup] TTS library installed.", flush=True)
69
 
70
- # 2. Pre-download XTTS-v2 weights so first inference isn't cold
71
  MODEL_DIR.mkdir(parents=True, exist_ok=True)
72
  config_path = MODEL_DIR / "config.json"
73
  if not config_path.exists():
74
  print("[setup] Downloading XTTS-v2 weights ...", flush=True)
75
  from huggingface_hub import snapshot_download
 
76
  snapshot_download(
77
  repo_id="coqui/XTTS-v2",
78
  local_dir=str(MODEL_DIR),
@@ -84,415 +58,21 @@ def setup():
84
 
85
 
86
  def get_tts():
87
- """
88
- Lazy-load the TTS model. Reuses the same instance across calls so the
89
- ~1.8 GB model is only loaded into memory once per process.
90
- """
91
  global _tts_instance
92
  if _tts_instance is None:
93
  from TTS.api import TTS
 
94
  print("[tts] Loading XTTS-v2 model ...", flush=True)
95
  _tts_instance = TTS(
96
  model_path=str(MODEL_DIR),
97
  config_path=str(MODEL_DIR / "config.json"),
98
  progress_bar=False,
99
- gpu=False, # CPU-only; set True if CUDA available
100
  )
101
- print("[tts] Model loaded βœ“", flush=True)
102
  return _tts_instance
103
- =======
104
-
105
- try:
106
- import tomllib
107
- except ModuleNotFoundError:
108
- try:
109
- import tomli as tomllib
110
- except ModuleNotFoundError:
111
- tomllib = None
112
-
113
- try:
114
- import tomli_w
115
- except ModuleNotFoundError:
116
- tomli_w = None
117
-
118
- from pathlib import Path
119
-
120
- os.environ["GRADIO_SSR_MODE"] = "0"
121
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
122
- os.environ["OMP_NUM_THREADS"] = str(os.cpu_count() or 1)
123
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
124
-
125
- import gradio as gr
126
- from pydub import AudioSegment
127
- from huggingface_hub import snapshot_download
128
-
129
- SECRET = os.environ.get("API_SECRET", "")
130
- REPO_DIR = Path(os.environ.get("FISH_REPO_DIR", r"C:\tmp\fish-speech"))
131
- MODEL_DIR = Path(os.environ.get("FISH_MODEL_DIR", r"C:\tmp\fish-speech-weights"))
132
-
133
- inference_lock = threading.Lock()
134
- initialized = False
135
-
136
- print("=== Chronis Fish Speech Space Booting ===", flush=True)
137
-
138
-
139
- # ──────────────────────────────────────────────────────────────────────────────
140
- # Patch 1 — LogMelSpectrogram
141
- #
142
- # History of bugs fixed in this class:
143
- #
144
- # Round 1 — AttributeError: 'LogMelSpectrogram' has no attribute 'hop_length'
145
- # firefly.py reads self.spec_transform.hop_length (and n_mels, n_fft, etc.)
146
- # directly on the object. They were only stored inside self._transform.
147
- # Fix: expose every __init__ param as a top-level self.* attribute.
148
- #
149
- # Round 2 (current) — RuntimeError: size of tensor a (1292) must match b (160)
150
- # at non-singleton dimension 3
151
- #
152
- # Root cause A — wrong input shape -> 4-D output:
153
- # vqgan/inference.py loads audio with torchaudio.load() -> (C, T),
154
- # then passes it as (1, C, T) = (1, 1, T) to model.encode().
155
- # firefly.encode() calls self.spec_transform(audios) with a 3-D tensor.
156
- # T.MelSpectrogram treats every dim except the last as a batch dim,
157
- # so (B=1, C=1, T) -> output (B=1, C=1, n_mels, T_frames) [4-D].
158
- # Downstream masks are computed as 3-D (B, 1, T_vq).
159
- # PyTorch broadcasting aligns from the right:
160
- # mels: (1, 1, 160, 1292) dim-3 = 1292
161
- # mel_masks_conv: (1, 1, 1, 160) dim-3 = 160
162
- # -> "size of tensor a (1292) must match b (160) at non-singleton dim 3"
163
- # Fix: squeeze the channel dim inside forward() so output is always 3-D.
164
- #
165
- # Root cause B — wrong default hyperparameters:
166
- # The "21hz" in firefly-gan-vq-fsq-8x1024-21hz encodes the token rate:
167
- # 44100 / (hop_length × 8_conv_strides) ≈ 21 -> hop_length = 256
168
- # n_mels is 160 for fish-speech, not 128.
169
- # Hydra injects the correct values via __init__ kwargs, but using the
170
- # right defaults prevents silent fallback failures.
171
- # ──────────────────────────────────────────────────────────────────────────────
172
- SPECTROGRAM_SRC = '''\
173
- """
174
- fish_speech.utils.spectrogram β€” patched by Chronis setup.
175
- See app.py Patch 1 comment block for the full explanation of fixes.
176
- """
177
- import torch
178
- import torch.nn as nn
179
- import torchaudio.transforms as T
180
-
181
-
182
- class LogMelSpectrogram(nn.Module):
183
- def __init__(
184
- self,
185
- sample_rate: int = 44100,
186
- n_fft: int = 1024,
187
- hop_length: int = 256,
188
- win_length: int = 1024,
189
- n_mels: int = 160,
190
- f_min: float = 0.0,
191
- f_max: float = None,
192
- center: bool = True,
193
- power: float = 1.0,
194
- norm: str = None,
195
- mel_scale: str = "slaney",
196
- clamp_min: float = 1e-5,
197
- ):
198
- super().__init__()
199
-
200
- # Every param must be a direct instance attribute.
201
- # firefly.py reads them as self.spec_transform.<attr>.
202
- self.sample_rate = sample_rate
203
- self.n_fft = n_fft
204
- self.hop_length = hop_length
205
- self.win_length = win_length
206
- self.n_mels = n_mels
207
- self.f_min = f_min
208
- self.f_max = f_max if f_max is not None else float(sample_rate) / 2.0
209
- self.clamp_min = clamp_min
210
-
211
- self._transform = T.MelSpectrogram(
212
- sample_rate = sample_rate,
213
- n_fft = n_fft,
214
- hop_length = hop_length,
215
- win_length = win_length,
216
- n_mels = n_mels,
217
- f_min = f_min,
218
- f_max = self.f_max,
219
- center = center,
220
- power = power,
221
- norm = norm,
222
- mel_scale = mel_scale,
223
- )
224
-
225
- def forward(self, x: torch.Tensor) -> torch.Tensor:
226
- """
227
- x : (B, T) | (T,) | (B, 1, T) | (B, C, T)
228
- out : (B, n_mels, T_frames) β€” always 3-D, never 4-D
229
-
230
- The channel-squeeze is critical. vqgan/inference.py passes audio as
231
- (B=1, C=1, T); without the squeeze T.MelSpectrogram returns a 4-D
232
- tensor which mismatches the 3-D conv mask, crashing at dim 3.
233
- """
234
- if x.ndim == 3:
235
- if x.shape[1] == 1:
236
- x = x.squeeze(1) # mono (B, 1, T) -> (B, T)
237
- else:
238
- x = x.mean(dim=1) # stereo (B, C, T) -> (B, T)
239
- mel = self._transform(x)
240
- return torch.log(torch.clamp(mel, min=self.clamp_min))
241
- '''
242
-
243
-
244
- def _patch_spectrogram_module():
245
- utils_dir = REPO_DIR / "fish_speech" / "utils"
246
- utils_dir.mkdir(parents=True, exist_ok=True)
247
-
248
- init_file = utils_dir / "__init__.py"
249
- if not init_file.exists():
250
- init_file.write_text("# auto-generated by Chronis setup\n")
251
-
252
- spec_file = utils_dir / "spectrogram.py"
253
- spec_file.write_text(SPECTROGRAM_SRC)
254
-
255
- # Delete any stale .pyc that could shadow the updated .py
256
- pyc_dir = utils_dir / "__pycache__"
257
- if pyc_dir.exists():
258
- for pyc in pyc_dir.glob("spectrogram*.pyc"):
259
- pyc.unlink()
260
- print(f"[patch] deleted stale {pyc}", flush=True)
261
-
262
- print(f"[patch] wrote {spec_file}", flush=True)
263
-
264
-
265
- # ──────────────────────────────────────────────────────────────────────────────
266
- # Patch 2 — strip pyaudio from all dependency manifests
267
- # ──────────────────────────────────────────────────────────────────────────────
268
- def _drop_dep(dep_list: list, pattern: str) -> list:
269
- return [d for d in dep_list if not d.lower().startswith(pattern)]
270
-
271
-
272
- def _patch_pyproject_toml():
273
- pyproject = REPO_DIR / "pyproject.toml"
274
- if not pyproject.exists():
275
- return
276
-
277
- with open(pyproject, "rb") as f:
278
- data = tomllib.load(f)
279
-
280
- changed = False
281
- deps = data.get("project", {}).get("dependencies", [])
282
- if deps:
283
- new_deps = _drop_dep(deps, "pyaudio")
284
- if new_deps != deps:
285
- data["project"]["dependencies"] = new_deps
286
- changed = True
287
-
288
- poetry_deps = data.get("tool", {}).get("poetry", {}).get("dependencies", {})
289
- if "pyaudio" in poetry_deps or "PyAudio" in poetry_deps:
290
- poetry_deps.pop("pyaudio", None)
291
- poetry_deps.pop("PyAudio", None)
292
- changed = True
293
-
294
- if changed:
295
- with open(pyproject, "wb") as f:
296
- tomli_w.dump(data, f)
297
- print("[patch] removed pyaudio from pyproject.toml", flush=True)
298
-
299
-
300
- def _patch_requirements_txt():
301
- for fname in ("requirements.txt", "requirements-base.txt"):
302
- req = REPO_DIR / fname
303
- if not req.exists():
304
- continue
305
- lines = req.read_text().splitlines()
306
- new_lines = [l for l in lines if not l.lower().startswith("pyaudio")]
307
- if new_lines != lines:
308
- req.write_text("\n".join(new_lines) + "\n")
309
- print(f"[patch] removed pyaudio from {fname}", flush=True)
310
-
311
-
312
- def _patch_setup_cfg():
313
- setup_cfg = REPO_DIR / "setup.cfg"
314
- if not setup_cfg.exists():
315
- return
316
- text = setup_cfg.read_text()
317
- new_text = "\n".join(
318
- l for l in text.splitlines() if not l.strip().lower().startswith("pyaudio")
319
- )
320
- if new_text != text:
321
- setup_cfg.write_text(new_text)
322
- print("[patch] removed pyaudio from setup.cfg", flush=True)
323
-
324
-
325
- def _patch_dependencies():
326
- global tomllib, tomli_w
327
- if tomllib is None or tomli_w is None:
328
- subprocess.run(
329
- [sys.executable, "-m", "pip", "install", "tomli", "tomli_w", "-q"],
330
- check=True,
331
- )
332
- import tomli as tomllib
333
- import tomli_w as tomli_w
334
 
335
- _patch_pyproject_toml()
336
- _patch_requirements_txt()
337
- _patch_setup_cfg()
338
-
339
-
340
- # ──────────────────────────────────────────────────────────────────────────────
341
- # Patch 3 — CPU-safe subprocess wrapper
342
- # ──────────────────────────────────────────────────────────────────────────────
343
- WRAPPER_PATH = Path("/tmp/_chronis_torch_cpu.py")
344
-
345
- _WRAPPER_SRC = '''\
346
- """
347
- Chronis CPU-safe subprocess wrapper.
348
- Forces torch.load -> CPU, disables weights_only, redirects .to(cuda) -> .to(cpu).
349
- Usage: python _chronis_torch_cpu.py <real_script.py> [args...]
350
- """
351
- import sys
352
- import torch
353
- import runpy
354
-
355
- _original_load = torch.load
356
-
357
- def _cpu_safe_load(f, map_location=None, pickle_module=None, **kwargs):
358
- kwargs["weights_only"] = False
359
- kwargs["map_location"] = "cpu"
360
- if pickle_module is not None:
361
- kwargs["pickle_module"] = pickle_module
362
- return _original_load(f, **kwargs)
363
-
364
- torch.load = _cpu_safe_load
365
-
366
- _orig_module_to = torch.nn.Module.to
367
- def _cpu_module_to(self, *args, **kwargs):
368
- new_args = []
369
- for a in args:
370
- if isinstance(a, (str, torch.device)) and "cuda" in str(a):
371
- a = torch.device("cpu")
372
- new_args.append(a)
373
- if "device" in kwargs and "cuda" in str(kwargs["device"]):
374
- kwargs["device"] = torch.device("cpu")
375
- return _orig_module_to(self, *new_args, **kwargs)
376
- torch.nn.Module.to = _cpu_module_to
377
-
378
- _orig_tensor_to = torch.Tensor.to
379
- def _cpu_tensor_to(self, *args, **kwargs):
380
- new_args = []
381
- for a in args:
382
- if isinstance(a, (str, torch.device)) and "cuda" in str(a):
383
- a = torch.device("cpu")
384
- new_args.append(a)
385
- if "device" in kwargs and "cuda" in str(kwargs["device"]):
386
- kwargs["device"] = torch.device("cpu")
387
- return _orig_tensor_to(self, *new_args, **kwargs)
388
- torch.Tensor.to = _cpu_tensor_to
389
-
390
- sys.argv = sys.argv[1:]
391
- runpy.run_path(sys.argv[0], run_name="__main__")
392
- '''
393
-
394
-
395
- def _patch_torch_load():
396
- WRAPPER_PATH.write_text(_WRAPPER_SRC)
397
- print(f"[patch] wrote subprocess wrapper -> {WRAPPER_PATH}", flush=True)
398
-
399
-
400
- # ──────────────────────────────────────────────────────────────────────────────
401
-
402
- def _build_env():
403
- existing = os.environ.get("PYTHONPATH", "")
404
- parts = [str(REPO_DIR)]
405
- if existing:
406
- parts.append(existing)
407
- new_pythonpath = os.pathsep.join(parts)
408
-
409
- return {
410
- **os.environ,
411
- "PYTHONPATH": new_pythonpath,
412
- "HYDRA_FULL_ERROR": "1",
413
- "CUDA_VISIBLE_DEVICES": "",
414
- "PYTHONUTF8": "1",
415
- }
416
- # Add this to _patch_spectrogram_module() in app.py, replacing the current version:
417
-
418
- def _patch_spectrogram_module():
419
- # Ensure the full package chain exists
420
- for pkg_dir in [
421
- REPO_DIR / "fish_speech",
422
- REPO_DIR / "fish_speech" / "utils",
423
- ]:
424
- pkg_dir.mkdir(parents=True, exist_ok=True)
425
- init_file = pkg_dir / "__init__.py"
426
- if not init_file.exists():
427
- init_file.write_text("# auto-generated\n")
428
- print(f"[patch] created {init_file}", flush=True)
429
-
430
- spec_file = REPO_DIR / "fish_speech" / "utils" / "spectrogram.py"
431
- spec_file.write_text(SPECTROGRAM_SRC)
432
-
433
- # Nuke ALL pycache under fish_speech to prevent stale imports
434
- for pyc_dir in (REPO_DIR / "fish_speech").rglob("__pycache__"):
435
- for f in pyc_dir.iterdir():
436
- f.unlink()
437
- print(f"[patch] cleared {pyc_dir}", flush=True)
438
-
439
- print(f"[patch] wrote {spec_file}", flush=True)
440
-
441
- # ──────────────────────────────────────────────────────────────────────────────
442
- # Setup
443
- # ──────────────────────────────────────────────────────────────────────────────
444
-
445
- def setup():
446
- global initialized
447
- if initialized:
448
- return
449
-
450
- if not REPO_DIR.exists():
451
- print("Cloning Fish Speech v1.5.0 ...", flush=True)
452
- subprocess.run(
453
- [
454
- "git", "clone",
455
- "--depth", "1",
456
- "--branch", "v1.5.0",
457
- "https://github.com/fishaudio/fish-speech.git",
458
- str(REPO_DIR),
459
- ],
460
- check=True,
461
- )
462
-
463
- _patch_spectrogram_module()
464
- _patch_dependencies()
465
- _patch_torch_load()
466
-
467
- # print("Installing Fish Speech (editable) ...", flush=True)
468
- # subprocess.run(
469
- # [sys.executable, "-m", "pip", "install", "-e", ".", "--quiet"],
470
- # cwd=str(REPO_DIR),
471
- # check=True,
472
- # )
473
-
474
- # Re-apply AFTER pip install — editable install can cache stale .pyc files
475
- _patch_spectrogram_module()
476
-
477
- if str(REPO_DIR) not in sys.path:
478
- sys.path.insert(0, str(REPO_DIR))
479
-
480
- if not MODEL_DIR.exists() or not any(MODEL_DIR.iterdir()):
481
- print("Downloading Fish Speech 1.5 weights ...", flush=True)
482
- snapshot_download(
483
- repo_id = "fishaudio/fish-speech-1.5",
484
- local_dir = str(MODEL_DIR),
485
- local_dir_use_symlinks = False,
486
- )
487
-
488
- print("Setup complete.", flush=True)
489
- initialized = True
490
- >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
491
-
492
-
493
- # ──────────────────────────────────────────────────────────────────────────────
494
- # Text helpers
495
- # ──────────────────────────────────────────────────────────────────────────────
496
 
497
  def clean_text(text: str) -> str:
498
  text = re.sub(r"[^\x00-\x7F]+", " ", text)
@@ -502,21 +82,10 @@ def clean_text(text: str) -> str:
502
  return text[:500]
503
 
504
 
505
- <<<<<<< HEAD
506
  def split_sentences(text: str, max_chars: int = 200) -> list[str]:
507
- """
508
- XTTS handles longer segments better than Fish Speech, so we use a
509
- generous 200-char chunk limit instead of 120.
510
- """
511
  parts = re.split(r"(?<=[.!?])\s+", text)
512
  chunks: list[str] = []
513
  buf = ""
514
- =======
515
- def split_sentences(text: str, max_chars: int = 120) -> list:
516
- parts = re.split(r"(?<=[.!?])\s+", text)
517
- chunks = []
518
- buf = ""
519
- >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
520
  for p in parts:
521
  if len(buf) + len(p) < max_chars:
522
  buf = (buf + " " + p).strip()
@@ -529,27 +98,15 @@ def split_sentences(text: str, max_chars: int = 120) -> list:
529
  return chunks or [text]
530
 
531
 
532
- # ──────────────────────────────────────────────────────────────────────────────
533
- # Audio helpers
534
- # ──────────────────────────────────────────────────────────────────────────────
535
-
536
  def prepare_ref_audio(ref_path: str) -> str:
537
- """
538
- <<<<<<< HEAD
539
- Normalise reference audio to mono 24 000 Hz WAV, capped at 10 seconds.
540
-
541
- XTTS-v2 expects 24 kHz input for its speaker encoder.
542
- Recommended reference length: 6-12 s; we cap at 10 s for CPU speed.
543
- """
544
  audio = AudioSegment.from_file(ref_path)
545
- audio = audio.set_channels(1).set_frame_rate(24_000).normalize()
546
 
547
- if len(audio) > 10_000:
548
- audio = audio[:10_000]
549
- elif len(audio) < 1_000:
550
- raise ValueError(
551
- f"Reference audio too short ({len(audio)} ms). Need at least 1 second."
552
- )
553
 
554
  fd, tmp_path = tempfile.mkstemp(suffix=".wav")
555
  os.close(fd)
@@ -557,20 +114,7 @@ def prepare_ref_audio(ref_path: str) -> str:
557
  return tmp_path
558
 
559
 
560
- # ──────────────────────────────────────────────────────────────────────────────
561
- # Inference
562
- # ──────────────────────────────────────────────────────────────────────────────
563
-
564
  def run_chunk(tts, text: str, ref_audio: str, out_path: str):
565
- """
566
- Synthesise one text chunk and write the result to out_path (WAV).
567
-
568
- XTTS-v2 tts_to_file() signature:
569
- text – the utterance
570
- speaker_wav – reference audio file(s) for voice cloning
571
- language – BCP-47 code; "en" covers most use-cases
572
- file_path – output WAV path
573
- """
574
  tts.tts_to_file(
575
  text=text,
576
  speaker_wav=ref_audio,
@@ -581,20 +125,17 @@ def run_chunk(tts, text: str, ref_audio: str, out_path: str):
581
 
582
  def synthesize(text: str, ref_audio_path: str, secret: str):
583
  with inference_lock:
584
- # ── Auth ──────────────────────────────────────────────────────────────
585
  if SECRET and secret != SECRET:
586
  return None, "Unauthorized"
587
 
588
  if not ref_audio_path or not Path(ref_audio_path).exists():
589
  return None, "Reference audio missing or not uploaded"
590
 
591
- # ── First-run setup ───────────────────────────────────────────────────
592
  try:
593
  setup()
594
  except Exception as e:
595
  return None, f"Setup failed: {e}"
596
 
597
- # ── Synthesis ─────────────────────────────────────────────────────────
598
  cleaned = clean_text(text)
599
  chunks = split_sentences(cleaned)
600
  workdir = Path(tempfile.mkdtemp(prefix="chronis_xtts_"))
@@ -616,15 +157,12 @@ def synthesize(text: str, ref_audio_path: str, secret: str):
616
  fd, tmp_out = tempfile.mkstemp(suffix=".wav")
617
  os.close(fd)
618
  combined.export(tmp_out, format="wav")
619
- # Return file path directly so Gradio renders a playable audio output.
620
  final_audio_path = tmp_out
621
  tmp_out = None
622
  return final_audio_path, "ok"
623
-
624
  except Exception as e:
625
  print(f"[synth] ERROR: {e}", flush=True)
626
  return None, str(e)
627
-
628
  finally:
629
  if clean_ref and Path(clean_ref).exists():
630
  try:
@@ -636,196 +174,9 @@ def synthesize(text: str, ref_audio_path: str, secret: str):
636
  os.unlink(tmp_out)
637
  except OSError:
638
  pass
639
- =======
640
- Normalise to mono 44100 Hz WAV, capped at 8 seconds.
641
-
642
- Fish Speech docs recommend 3-10 s of reference. We cap at 8 s:
643
- - Short enough to keep CPU encode time reasonable
644
- - Long enough for good speaker characterisation
645
- - Avoids edge-case rounding in the conv-mask stride at 15 s lengths
646
- """
647
- audio = AudioSegment.from_file(ref_path)
648
- audio = audio.set_channels(1).set_frame_rate(44100).normalize()
649
-
650
- if len(audio) > 8_000:
651
- audio = audio[:8_000]
652
- elif len(audio) < 1_000:
653
- raise ValueError(
654
- f"Reference audio too short ({len(audio)}ms). Need at least 1 second."
655
- )
656
-
657
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
658
- audio.export(tmp.name, format="wav")
659
- return tmp.name
660
-
661
-
662
- # ──────────────────────────────────────────────────────────────────────────────
663
- # Inference pipeline
664
- # ──────────────────────────────────────────────────────────────────────────────
665
-
666
- def run_step(cmd: list, name: str, cwd: Path, expect_output: Path = None):
667
- """
668
- Run a Fish Speech subprocess through the CPU wrapper.
669
- Raises a detailed RuntimeError on non-zero exit or missing expected output.
670
- """
671
- print(f"[{name}] starting ...", flush=True)
672
- wrapped_cmd = [cmd[0], str(WRAPPER_PATH)] + cmd[1:]
673
-
674
- result = subprocess.run(
675
- wrapped_cmd,
676
- cwd = str(cwd),
677
- capture_output = True,
678
- text = True,
679
- encoding = "utf-8",
680
- errors = "replace",
681
- env = _build_env(),
682
- timeout = 600,
683
- )
684
-
685
- if result.stdout.strip():
686
- print(f"[{name}] stdout:\n{result.stdout[-1200:]}".encode("utf-8", "replace").decode(), flush=True)
687
-
688
- if result.returncode != 0:
689
- diag = (
690
- f"[{name}] FAILED (exit {result.returncode})\n"
691
- f"--- stderr ---\n{result.stderr[-1500:]}\n"
692
- f"--- stdout ---\n{result.stdout[-600:]}"
693
- )
694
- print(diag, flush=True)
695
- raise RuntimeError(diag)
696
-
697
- if expect_output is not None and not expect_output.exists():
698
- raise RuntimeError(
699
- f"[{name}] exited 0 but expected output missing: {expect_output}\n"
700
- f"stdout: {result.stdout[-800:]}\nstderr: {result.stderr[-800:]}"
701
- )
702
-
703
- print(f"[{name}] done βœ“", flush=True)
704
-
705
-
706
- def run_chunk(text: str, ref_audio: str, workdir: Path, idx: int) -> str:
707
- chunk_dir = workdir / f"chunk_{idx}"
708
- chunk_dir.mkdir(parents=True, exist_ok=True)
709
-
710
- ref_copy = chunk_dir / "ref.wav"
711
- shutil.copy(ref_audio, ref_copy)
712
-
713
- vq_tokens = chunk_dir / "fake.npy"
714
- sem_tokens = chunk_dir / "codes_0.npy"
715
- out_wav = chunk_dir / "fake.wav"
716
-
717
- # In fish-speech v1.5, tools/vqgan/inference.py handles BOTH encode and
718
- # decode. Mode is auto-detected from the input file extension:
719
- # .wav -> encode -> writes fake.npy
720
- # .npy -> decode -> writes fake.wav
721
- vqgan_script = str(REPO_DIR / "tools" / "vqgan" / "inference.py")
722
- t2s_script = str(REPO_DIR / "fish_speech" / "models" / "text2semantic" / "inference.py")
723
- firefly_ckpt = str(MODEL_DIR / "firefly-gan-vq-fsq-8x1024-21hz-generator.pth")
724
-
725
- # Step 1: Reference audio -> VQ tokens
726
- run_step(
727
- [
728
- sys.executable, vqgan_script,
729
- "-i", str(ref_copy),
730
- "--checkpoint-path", firefly_ckpt,
731
- "--device", "cpu",
732
- ],
733
- name = "Codec Encode",
734
- cwd = chunk_dir,
735
- expect_output = vq_tokens,
736
- )
737
-
738
- # Step 2: Text + VQ tokens -> semantic codes
739
- run_step(
740
- [
741
- sys.executable, t2s_script,
742
- "--text", text,
743
- "--prompt-tokens", str(vq_tokens),
744
- "--checkpoint-path", str(MODEL_DIR),
745
- "--num-samples", "1",
746
- "--device", "cpu",
747
- ],
748
- name = "Text2Semantic",
749
- cwd = chunk_dir,
750
- expect_output = sem_tokens,
751
- )
752
-
753
- # Step 3: Semantic codes -> audio
754
- run_step(
755
- [
756
- sys.executable, vqgan_script,
757
- "-i", str(sem_tokens),
758
- "--checkpoint-path", firefly_ckpt,
759
- "--device", "cpu",
760
- ],
761
- name = "Codec Decode",
762
- cwd = chunk_dir,
763
- expect_output = out_wav,
764
- )
765
-
766
- return str(out_wav)
767
-
768
-
769
- # ──────────────────────────────────────────────────────────────────────────────
770
- # Main synthesis entry point
771
- # ──────────────────────────────────────────────────────────────────────────────
772
-
773
- def synthesize(text: str, ref_audio_path: str, secret: str):
774
- with inference_lock:
775
- if SECRET and secret != SECRET:
776
- return "", "Unauthorized"
777
-
778
- if not ref_audio_path or not Path(ref_audio_path).exists():
779
- return "", "Reference audio missing or not uploaded"
780
-
781
- try:
782
- # Check if the model directory already has files in it
783
- if not MODEL_DIR.exists() or not any(MODEL_DIR.iterdir()):
784
- print("[synth] Running first-time setup...", flush=True)
785
- setup()
786
- else:
787
- # This skips the 'pip install' that causes the Access Denied error
788
- print("[synth] Skipping setup: Model weights already present.", flush=True)
789
- except Exception as e:
790
- return "", f"Setup failed: {e}"
791
-
792
- cleaned = clean_text(text)
793
- chunks = split_sentences(cleaned)
794
- workdir = Path(tempfile.mkdtemp(prefix="chronis_tts_"))
795
-
796
- try:
797
- clean_ref = prepare_ref_audio(ref_audio_path)
798
- combined = AudioSegment.empty()
799
-
800
- for i, chunk in enumerate(chunks):
801
- print(f"[synth] chunk {i+1}/{len(chunks)}: {chunk[:80]!r}", flush=True)
802
- out = run_chunk(chunk, clean_ref, workdir, i)
803
- combined += AudioSegment.from_wav(out)
804
- gc.collect()
805
-
806
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
807
- combined.export(tmp.name, format="wav")
808
-
809
- with open(tmp.name, "rb") as f:
810
- audio_b64 = base64.b64encode(f.read()).decode()
811
-
812
- os.unlink(tmp.name)
813
- return audio_b64, "ok"
814
-
815
- except Exception as e:
816
- print(f"[synth] ERROR: {e}", flush=True)
817
- return "", str(e)
818
-
819
- finally:
820
- >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
821
  shutil.rmtree(workdir, ignore_errors=True)
822
 
823
 
824
- # ──────────────────────────────────────────────────────────────────────────────
825
- <<<<<<< HEAD
826
- # Gradio UI (same contract as the Fish Speech version)
827
- # ──────────────────────────────────────────────────────────────────────────────
828
-
829
  demo = gr.Interface(
830
  fn=synthesize,
831
  inputs=[
@@ -839,34 +190,9 @@ demo = gr.Interface(
839
  ],
840
  api_name="predict",
841
  title="Chronis XTTS-v2",
842
- description="Voice cloning TTS β€” send a voice note, get the cloned voice back.",
843
  flagging_mode="never",
844
  )
845
 
846
  demo.queue()
847
  demo.launch(server_name="0.0.0.0", server_port=7860)
848
-
849
- =======
850
- # Gradio UI
851
- # ──────────────────────────────────────────────────────────────────────────────
852
-
853
- demo = gr.Interface(
854
- fn = synthesize,
855
- inputs = [
856
- gr.Textbox(label="Text to synthesise"),
857
- gr.Audio(type="filepath", label="Reference Voice (3-8 second voice note)"),
858
- gr.Textbox(label="Secret", type="password"),
859
- ],
860
- outputs = [
861
- gr.Textbox(label="Audio Base64"),
862
- gr.Textbox(label="Status"),
863
- ],
864
- api_name = "predict",
865
- title = "Chronis Fish Speech",
866
- description = "Voice cloning TTS - send a voice note, get the cloned voice back.",
867
- flagging_mode = "never",
868
- )
869
-
870
- demo.queue()
871
- demo.launch(server_name="0.0.0.0", server_port=7860)
872
- >>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
 
1
  import os
 
 
 
 
 
2
 
3
  os.environ["PYTHONUTF8"] = "1"
4
  os.environ["PYTHONIOENCODING"] = "utf-8"
 
9
 
10
  import re
11
  import gc
 
 
 
 
 
 
 
 
 
12
  import tempfile
13
  import subprocess
14
  import shutil
15
  import threading
 
16
  from pathlib import Path
17
 
18
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
31
  print("=== Chronis XTTS-v2 Space Booting ===", flush=True)
32
 
33
 
 
 
 
 
34
  def setup():
35
+ """Install Coqui TTS if needed and download XTTS-v2 weights once."""
 
 
 
 
36
  try:
37
  import TTS # noqa: F401
38
  print("[setup] TTS library already installed.", flush=True)
39
  except ImportError:
40
  print("[setup] Installing TTS library ...", flush=True)
41
+ subprocess.run([sys.executable, "-m", "pip", "install", "TTS", "-q"], check=True)
 
 
 
42
  print("[setup] TTS library installed.", flush=True)
43
 
 
44
  MODEL_DIR.mkdir(parents=True, exist_ok=True)
45
  config_path = MODEL_DIR / "config.json"
46
  if not config_path.exists():
47
  print("[setup] Downloading XTTS-v2 weights ...", flush=True)
48
  from huggingface_hub import snapshot_download
49
+
50
  snapshot_download(
51
  repo_id="coqui/XTTS-v2",
52
  local_dir=str(MODEL_DIR),
 
58
 
59
 
60
  def get_tts():
61
+ """Lazy-load model once per process."""
 
 
 
62
  global _tts_instance
63
  if _tts_instance is None:
64
  from TTS.api import TTS
65
+
66
  print("[tts] Loading XTTS-v2 model ...", flush=True)
67
  _tts_instance = TTS(
68
  model_path=str(MODEL_DIR),
69
  config_path=str(MODEL_DIR / "config.json"),
70
  progress_bar=False,
71
+ gpu=False,
72
  )
73
+ print("[tts] Model loaded", flush=True)
74
  return _tts_instance
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  def clean_text(text: str) -> str:
78
  text = re.sub(r"[^\x00-\x7F]+", " ", text)
 
82
  return text[:500]
83
 
84
 
 
85
  def split_sentences(text: str, max_chars: int = 200) -> list[str]:
 
 
 
 
86
  parts = re.split(r"(?<=[.!?])\s+", text)
87
  chunks: list[str] = []
88
  buf = ""
 
 
 
 
 
 
89
  for p in parts:
90
  if len(buf) + len(p) < max_chars:
91
  buf = (buf + " " + p).strip()
 
98
  return chunks or [text]
99
 
100
 
 
 
 
 
101
  def prepare_ref_audio(ref_path: str) -> str:
102
+ """Normalize to mono 24k WAV and cap to 10 seconds."""
 
 
 
 
 
 
103
  audio = AudioSegment.from_file(ref_path)
104
+ audio = audio.set_channels(1).set_frame_rate(24000).normalize()
105
 
106
+ if len(audio) > 10000:
107
+ audio = audio[:10000]
108
+ elif len(audio) < 1000:
109
+ raise ValueError(f"Reference audio too short ({len(audio)} ms). Need at least 1 second.")
 
 
110
 
111
  fd, tmp_path = tempfile.mkstemp(suffix=".wav")
112
  os.close(fd)
 
114
  return tmp_path
115
 
116
 
 
 
 
 
117
  def run_chunk(tts, text: str, ref_audio: str, out_path: str):
 
 
 
 
 
 
 
 
 
118
  tts.tts_to_file(
119
  text=text,
120
  speaker_wav=ref_audio,
 
125
 
126
  def synthesize(text: str, ref_audio_path: str, secret: str):
127
  with inference_lock:
 
128
  if SECRET and secret != SECRET:
129
  return None, "Unauthorized"
130
 
131
  if not ref_audio_path or not Path(ref_audio_path).exists():
132
  return None, "Reference audio missing or not uploaded"
133
 
 
134
  try:
135
  setup()
136
  except Exception as e:
137
  return None, f"Setup failed: {e}"
138
 
 
139
  cleaned = clean_text(text)
140
  chunks = split_sentences(cleaned)
141
  workdir = Path(tempfile.mkdtemp(prefix="chronis_xtts_"))
 
157
  fd, tmp_out = tempfile.mkstemp(suffix=".wav")
158
  os.close(fd)
159
  combined.export(tmp_out, format="wav")
 
160
  final_audio_path = tmp_out
161
  tmp_out = None
162
  return final_audio_path, "ok"
 
163
  except Exception as e:
164
  print(f"[synth] ERROR: {e}", flush=True)
165
  return None, str(e)
 
166
  finally:
167
  if clean_ref and Path(clean_ref).exists():
168
  try:
 
174
  os.unlink(tmp_out)
175
  except OSError:
176
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  shutil.rmtree(workdir, ignore_errors=True)
178
 
179
 
 
 
 
 
 
180
  demo = gr.Interface(
181
  fn=synthesize,
182
  inputs=[
 
190
  ],
191
  api_name="predict",
192
  title="Chronis XTTS-v2",
193
+ description="Voice cloning TTS - send a voice note, get the cloned voice back.",
194
  flagging_mode="never",
195
  )
196
 
197
  demo.queue()
198
  demo.launch(server_name="0.0.0.0", server_port=7860)