Spaces:
Running on Zero
Running on Zero
"""Build voice sample / reference clips from CMU ARCTIC.

fish-speech / OpenAudio ship no reference audio, so we source short, permissively
licensed clips from CMU ARCTIC (festvox.org — "distributed with no restrictions for
academic, research and commercial use"). Each voice clip concatenates a few ARCTIC
sentences (~20s) so it doubles as a clean cloning reference for OpenAudio S1-mini.

Outputs:
  tts/voices/samples/<voice>.wav + .txt — played on the Voices page
  tts/voices/<preset>.wav + .txt — picked up by tts/voices.py PRESETS so the
    podcast hosts actually sound distinct

Run from the repo root: python scripts/build_voice_samples.py
WAVs are tracked via Git LFS (see .gitattributes).
"""
| from __future__ import annotations | |
| import io | |
| import os | |
| import shutil | |
| import urllib.request | |
| import numpy as np | |
| import soundfile as sf | |
# Root URL of the CMU ARCTIC distribution; speaker folders are addressed below it.
BASE = "http://festvox.org/cmu_arctic/cmu_arctic"

# Utterance ids arctic_a0001 .. arctic_a0006: 6 sentences ≈ 20s.
UTTS = ["arctic_a%04d" % index for index in range(1, 7)]

# Seconds of silence appended after each utterance when clips are joined.
GAP_S = 0.15

# Voices page id -> CMU ARCTIC speaker (distinct timbre/accent each).
VOICES = dict(
    nova="slt",   # female, US
    atlas="awb",  # male, Scottish
    echo="bdl",   # male, US
    sage="jmk",   # male, Canadian
    vivi="clb",   # female, US
    onyx="rms",   # male, US
    rune="ksp",   # male, Indian
)

# Functional presets in tts/voices.py -> reuse a display clip as the cloning reference.
PRESETS = dict(
    narrator_warm="nova",
    host_energetic="echo",
    guest_calm="vivi",
)

# Preset clips live directly in tts/voices; display samples go one level deeper.
PRESETS_DIR = os.path.join("tts", "voices")
SAMPLES_DIR = os.path.join(PRESETS_DIR, "samples")
def _fetch(url: str) -> bytes:
    """Download *url* and return the whole response body as bytes.

    The request uses a 60-second timeout. Any error raised by ``urlopen`` or
    ``read`` propagates to the caller unchanged.
    """
    with urllib.request.urlopen(url, timeout=60) as response:
        payload = response.read()
    return payload
def _transcripts(spk: str) -> dict[str, str]:
    """Fetch and parse the prompt transcripts for one CMU ARCTIC speaker.

    Downloads ``etc/txt.done.data`` from the speaker's ``cmu_us_<spk>_arctic``
    folder. Lines are expected to start with ``(``, carry the utterance id as
    their second whitespace-separated token, and hold the sentence in double
    quotes.

    Args:
        spk: Speaker code inserted into the ``cmu_us_<spk>_arctic`` URL segment.

    Returns:
        Mapping of utterance id to the text between the first and the last
        double quote on that line. Lines that do not fit the expected shape
        are skipped.
    """
    raw = _fetch(f"{BASE}/cmu_us_{spk}_arctic/etc/txt.done.data").decode("utf-8", "ignore")
    out = {}
    for line in raw.splitlines():
        line = line.strip()
        if not line.startswith("("):
            continue
        # Use the outermost quotes so quote characters inside the sentence survive.
        first = line.find('"')
        last = line.rfind('"')
        if first == -1 or first == last:
            # No quoted text at all, or a single unbalanced quote: nothing usable.
            continue
        fields = line.split(None, 2)
        if len(fields) < 2:
            # The line has no second token to serve as the utterance id.
            continue
        out[fields[1]] = line[first + 1 : last]
    return out
def build():
    """Download ARCTIC utterances and write every voice sample and preset clip.

    For each ``VOICES`` entry the utterances listed in ``UTTS`` are fetched,
    down-mixed to mono when multi-channel, joined with ``GAP_S`` seconds of
    silence after each one, peak-normalised to 0.95 and written to
    ``SAMPLES_DIR`` as ``<voice>.wav`` together with a ``<voice>.txt``
    transcript. Each ``PRESETS`` entry is then copied from its source sample
    into ``PRESETS_DIR``.

    Raises:
        ValueError: If the utterances of one speaker do not all share the same
            sample rate; concatenating them would produce a clip that plays at
            the wrong speed.
    """
    os.makedirs(SAMPLES_DIR, exist_ok=True)
    for vid, spk in VOICES.items():
        txts = _transcripts(spk)
        chunks, texts, sr = [], [], None
        for uid in UTTS:
            wav_bytes = _fetch(f"{BASE}/cmu_us_{spk}_arctic/wav/{uid}.wav")
            audio, utt_sr = sf.read(io.BytesIO(wav_bytes), dtype="float32")
            if sr is None:
                sr = utt_sr
            elif utt_sr != sr:
                # The clip is written with a single rate, so mixed rates cannot be joined.
                raise ValueError(
                    f"sample rate mismatch for {spk}/{uid}: {utt_sr} Hz, expected {sr} Hz"
                )
            if audio.ndim > 1:
                # Average the channels down to mono.
                audio = audio.mean(axis=1)
            chunks.append(audio)
            chunks.append(np.zeros(int(sr * GAP_S), dtype="float32"))
            if uid in txts:
                texts.append(txts[uid])
            else:
                # The .txt is the cloning transcript; flag audio that has no matching text.
                print(f" warning: no transcript for {spk}/{uid}; {vid}.txt will not match the audio")
        full = np.concatenate(chunks)
        # A peak of 0 (all-silent audio) falls back to 1.0 to avoid dividing by zero.
        peak = float(np.abs(full).max()) or 1.0
        full = (full / peak * 0.95).astype("float32")
        sf.write(os.path.join(SAMPLES_DIR, f"{vid}.wav"), full, sr)
        with open(os.path.join(SAMPLES_DIR, f"{vid}.txt"), "w", encoding="utf-8") as f:
            f.write(" ".join(texts))
        print(f" sample {vid:6s} <- {spk} {len(full)/sr:4.1f}s")
    for pid, src in PRESETS.items():
        # Each preset reuses an already-built sample, audio and transcript alike.
        for ext in (".wav", ".txt"):
            shutil.copyfile(os.path.join(SAMPLES_DIR, f"{src}{ext}"),
                            os.path.join(PRESETS_DIR, f"{pid}{ext}"))
        print(f" preset {pid:15s} <- {src}")
if __name__ == "__main__":
    # Script entry point: rebuild all sample and preset clips, then confirm completion.
    build()
    print("done.")