podify / scripts /build_voice_samples.py
jayaspjacob
Add playable voice samples + cap episode length at 2 min
310ca13
Raw
History Blame Contribute Delete
3.61 kB
"""Build voice sample / reference clips from CMU ARCTIC.
fish-speech / OpenAudio ship no reference audio, so we source short, permissively
licensed clips from CMU ARCTIC (festvox.org — "distributed with no restrictions for
academic, research and commercial use"). Each voice clip concatenates a few ARCTIC
sentences (~20s) so it doubles as a clean cloning reference for OpenAudio S1-mini.
Outputs:
tts/voices/samples/<voice>.wav + .txt — played on the Voices page
tts/voices/<preset>.wav + .txt — picked up by tts/voices.py PRESETS so the
podcast hosts actually sound distinct
Run from the repo root: python scripts/build_voice_samples.py
WAVs are tracked via Git LFS (see .gitattributes).
"""
from __future__ import annotations
import io
import os
import shutil
import urllib.request
import numpy as np
import soundfile as sf
# Root URL under which the per-speaker CMU ARCTIC directories are fetched.
BASE = "http://festvox.org/cmu_arctic/cmu_arctic"

# Utterance ids concatenated into every clip: 6 sentences ≈ 20s.
UTTS = list(map("arctic_a{:04d}".format, range(1, 7)))

# Seconds of silence appended after each utterance.
GAP_S = 0.15

# Voices page id -> CMU ARCTIC speaker (distinct timbre/accent each).
VOICES = dict(
    nova="slt",   # female, US
    atlas="awb",  # male, Scottish
    echo="bdl",   # male, US
    sage="jmk",   # male, Canadian
    vivi="clb",   # female, US
    onyx="rms",   # male, US
    rune="ksp",   # male, Indian
)

# Functional presets in tts/voices.py -> reuse a display clip as the cloning reference.
PRESETS = dict(
    narrator_warm="nova",
    host_energetic="echo",
    guest_calm="vivi",
)

# Preset clips live in the voices directory; display samples in its "samples" child.
PRESETS_DIR = os.path.join("tts", "voices")
SAMPLES_DIR = os.path.join(PRESETS_DIR, "samples")
def _fetch(url: str) -> bytes:
    """Open *url* and return the complete response body as bytes.

    The request uses a 60-second timeout. Nothing is caught here, so any
    error raised while opening or reading propagates to the caller.
    """
    response = urllib.request.urlopen(url, timeout=60)
    with response:
        payload = response.read()
    return payload
def _transcripts(spk: str) -> dict[str, str]:
    """Return a mapping of utterance id -> transcript text for speaker *spk*.

    Downloads ``etc/txt.done.data`` from the speaker's ARCTIC directory and
    parses every line of the form ``( <utt_id> "<text>" )``. The text is
    everything between the first and the last double quote on the line, so
    quotes embedded in a sentence are kept.

    Lines that do not match are skipped rather than raising:
      * lines not starting with ``(``,
      * lines with no double quote, or only one (unterminated text),
      * lines with no id between ``(`` and the opening quote.

    Args:
        spk: speaker code inserted into the ``cmu_us_<spk>_arctic`` path.

    Returns:
        Dict keyed by utterance id; empty if no line could be parsed.
    """
    raw = _fetch(f"{BASE}/cmu_us_{spk}_arctic/etc/txt.done.data").decode("utf-8", "ignore")
    out: dict[str, str] = {}
    for line in raw.splitlines():
        line = line.strip()
        if not line.startswith("("):
            continue
        first = line.find('"')
        last = line.rfind('"')
        if first == -1 or first == last:
            # No quoted text at all, or an opening quote with no closing one.
            continue
        # Take the id from between "(" and the opening quote so that both
        # "( id "text" )" and "(id "text")" resolve to the same key.
        head = line[1:first].split()
        if not head:
            continue
        out[head[0]] = line[first + 1:last]
    return out
def build():
    """Download CMU ARCTIC audio and write the sample and preset voice clips.

    For every entry in ``VOICES`` the utterances listed in ``UTTS`` are
    fetched, reduced to one channel if needed, joined with ``GAP_S`` seconds
    of silence after each one, peak-normalised to 0.95 and written to
    ``SAMPLES_DIR`` as ``<voice>.wav`` next to a ``<voice>.txt`` transcript.
    Afterwards each ``PRESETS`` entry receives a copy of its source voice's
    two files in ``PRESETS_DIR``. A progress line is printed per file pair.
    """
    os.makedirs(SAMPLES_DIR, exist_ok=True)

    for voice_id, speaker in VOICES.items():
        transcript_by_utt = _transcripts(speaker)
        segments = []
        sentences = []
        rate = 16000  # initial value; overwritten by the rate of each decoded file
        for utt in UTTS:
            wav_bytes = _fetch(f"{BASE}/cmu_us_{speaker}_arctic/wav/{utt}.wav")
            samples, rate = sf.read(io.BytesIO(wav_bytes), dtype="float32")
            if samples.ndim > 1:
                # More than one dimension: average across axis 1 to get a 1-D signal.
                samples = samples.mean(axis=1)
            silence = np.zeros(int(rate * GAP_S), dtype="float32")
            segments += [samples, silence]
            # Utterances without a transcript still contribute audio, just no text.
            if utt in transcript_by_utt:
                sentences.append(transcript_by_utt[utt])

        mix = np.concatenate(segments)
        peak = float(np.abs(mix).max())
        if peak == 0.0:
            # All-silent signal: avoid dividing by zero.
            peak = 1.0
        mix = (mix / peak * 0.95).astype("float32")

        wav_path = os.path.join(SAMPLES_DIR, f"{voice_id}.wav")
        txt_path = os.path.join(SAMPLES_DIR, f"{voice_id}.txt")
        sf.write(wav_path, mix, rate)
        with open(txt_path, "w", encoding="utf-8") as fh:
            fh.write(" ".join(sentences))
        print(f" sample {voice_id:6s} <- {speaker} {len(mix)/rate:4.1f}s")

    for preset_id, source_voice in PRESETS.items():
        # Copy the audio first, then its transcript.
        for ext in (".wav", ".txt"):
            shutil.copyfile(
                os.path.join(SAMPLES_DIR, f"{source_voice}{ext}"),
                os.path.join(PRESETS_DIR, f"{preset_id}{ext}"),
            )
        print(f" preset {preset_id:15s} <- {source_voice}")
# Script entry point: build every sample and preset clip, then report completion.
if __name__ == "__main__":
    build()
    print("done.")