"""Build voice sample / reference clips from CMU ARCTIC. fish-speech / OpenAudio ship no reference audio, so we source short, permissively licensed clips from CMU ARCTIC (festvox.org — "distributed with no restrictions for academic, research and commercial use"). Each voice clip concatenates a few ARCTIC sentences (~20s) so it doubles as a clean cloning reference for OpenAudio S1-mini. Outputs: tts/voices/samples/.wav + .txt — played on the Voices page tts/voices/.wav + .txt — picked up by tts/voices.py PRESETS so the podcast hosts actually sound distinct Run from the repo root: python scripts/build_voice_samples.py WAVs are tracked via Git LFS (see .gitattributes). """ from __future__ import annotations import io import os import shutil import urllib.request import numpy as np import soundfile as sf BASE = "http://festvox.org/cmu_arctic/cmu_arctic" UTTS = [f"arctic_a{n:04d}" for n in range(1, 7)] # 6 sentences ≈ 20s GAP_S = 0.15 # Voices page id -> CMU ARCTIC speaker (distinct timbre/accent each). VOICES = { "nova": "slt", # female, US "atlas": "awb", # male, Scottish "echo": "bdl", # male, US "sage": "jmk", # male, Canadian "vivi": "clb", # female, US "onyx": "rms", # male, US "rune": "ksp", # male, Indian } # Functional presets in tts/voices.py -> reuse a display clip as the cloning reference. 
PRESETS = {
    "narrator_warm": "nova",
    "host_energetic": "echo",
    "guest_calm": "vivi",
}

# Directory that receives one <voice_id>.wav / .txt pair per VOICES entry.
SAMPLES_DIR = os.path.join("tts", "voices", "samples")
# Directory that receives one <preset_id>.wav / .txt pair per PRESETS entry.
PRESETS_DIR = os.path.join("tts", "voices")


def _fetch(url: str) -> bytes:
    """Download *url* and return the raw response body.

    Uses a 60 second timeout; any error raised by ``urlopen`` propagates to
    the caller.
    """
    with urllib.request.urlopen(url, timeout=60) as response:
        return response.read()


def _parse_transcripts(raw: str) -> dict[str, str]:
    """Parse transcript text into ``{utterance_id: sentence}``.

    Only lines that start with ``(`` and contain a double quote are
    considered. The second whitespace-separated field is taken as the
    utterance id and the text between the first and last double quote as the
    sentence. Lines without a second field are skipped.
    """
    out: dict[str, str] = {}
    for line in raw.splitlines():
        line = line.strip()
        if not line.startswith("(") or '"' not in line:
            continue
        fields = line.split()
        # A missing id field is the only way such a line can fail to parse
        # (the quote lookups below cannot fail once '"' is known to be present).
        if len(fields) < 2:
            continue
        out[fields[1]] = line[line.index('"') + 1 : line.rindex('"')]
    return out


def _transcripts(spk: str) -> dict:
    """Fetch and parse the ``txt.done.data`` transcript file for speaker *spk*.

    Returns a mapping of utterance id to sentence text. Undecodable bytes in
    the download are dropped rather than raising.
    """
    raw = _fetch(f"{BASE}/cmu_us_{spk}_arctic/etc/txt.done.data").decode("utf-8", "ignore")
    return _parse_transcripts(raw)


def _build_voice_clip(spk: str) -> tuple[np.ndarray, int, str]:
    """Download the UTTS sentences for *spk* and join them into one clip.

    Returns ``(audio, sample_rate, text)`` where *audio* is a mono float32
    array peak-normalised to 0.95 and *text* is the space-joined transcript
    of the sentences that had one.

    Raises:
        ValueError: if the downloaded files do not all share one sample
            rate, since concatenating them would produce a clip that plays
            partly at the wrong speed.
    """
    txts = _transcripts(spk)
    chunks, texts, sr = [], [], None
    for uid in UTTS:
        wav_bytes = _fetch(f"{BASE}/cmu_us_{spk}_arctic/wav/{uid}.wav")
        audio, rate = sf.read(io.BytesIO(wav_bytes), dtype="float32")
        if sr is None:
            sr = rate
        elif rate != sr:
            raise ValueError(
                f"{spk}/{uid}: sample rate {rate} differs from {sr}; cannot concatenate"
            )
        if audio.ndim > 1:
            # Down-mix multi-channel audio to mono.
            audio = audio.mean(axis=1)
        chunks.append(audio)
        chunks.append(np.zeros(int(sr * GAP_S), dtype="float32"))
        if uid in txts:
            texts.append(txts[uid])
        else:
            # Keep going, but make the audio/text mismatch visible: the .txt
            # is meant to describe what is spoken in the .wav.
            print(f" warn   {spk}: no transcript for {uid}")
    full = np.concatenate(chunks)
    # Fall back to 1.0 so an all-silent clip does not divide by zero.
    peak = float(np.abs(full).max()) or 1.0
    full = (full / peak * 0.95).astype("float32")
    return full, sr, " ".join(texts)


def build():
    """Write every voice sample, then copy the preset reference clips.

    For each entry in VOICES a ``.wav`` and matching ``.txt`` are written to
    SAMPLES_DIR. For each entry in PRESETS the source voice's pair is then
    copied into PRESETS_DIR under the preset id. Paths are relative, so this
    must run from the repo root. Performs network downloads.
    """
    os.makedirs(SAMPLES_DIR, exist_ok=True)
    for vid, spk in VOICES.items():
        full, sr, text = _build_voice_clip(spk)
        sf.write(os.path.join(SAMPLES_DIR, f"{vid}.wav"), full, sr)
        with open(os.path.join(SAMPLES_DIR, f"{vid}.txt"), "w", encoding="utf-8") as f:
            f.write(text)
        print(f" sample {vid:6s} <- {spk} {len(full)/sr:4.1f}s")
    for pid, src in PRESETS.items():
        for ext in (".wav", ".txt"):
            shutil.copyfile(
                os.path.join(SAMPLES_DIR, f"{src}{ext}"),
                os.path.join(PRESETS_DIR, f"{pid}{ext}"),
            )
        print(f" preset {pid:15s} <- {src}")


if __name__ == "__main__":
    build()
    print("done.")