Spaces:
Running on Zero
Running on Zero
File size: 3,612 Bytes
"""Build voice sample / reference clips from CMU ARCTIC.

fish-speech / OpenAudio ship no reference audio, so we source short, permissively
licensed clips from CMU ARCTIC (festvox.org — "distributed with no restrictions for
academic, research and commercial use"). Each voice clip concatenates a few ARCTIC
sentences (~20s) so it doubles as a clean cloning reference for OpenAudio S1-mini.

Outputs:
    tts/voices/samples/<voice>.wav + .txt   — played on the Voices page
    tts/voices/<preset>.wav + .txt          — picked up by tts/voices.py PRESETS so the
                                              podcast hosts actually sound distinct

Run from the repo root: python scripts/build_voice_samples.py
WAVs are tracked via Git LFS (see .gitattributes).
"""
from __future__ import annotations
import io
import os
import shutil
import urllib.request
import numpy as np
import soundfile as sf
# Root URL of the CMU ARCTIC distribution; each speaker's files are addressed
# below it as "cmu_us_<speaker>_arctic/...".
BASE = "http://festvox.org/cmu_arctic/cmu_arctic"
# Utterance ids concatenated into every clip: arctic_a0001 .. arctic_a0006.
UTTS = [f"arctic_a{n:04d}" for n in range(1, 7)] # 6 sentences ≈ 20s
# Seconds of silence appended after each utterance when clips are joined.
GAP_S = 0.15
# Voices page id -> CMU ARCTIC speaker (distinct timbre/accent each).
VOICES = {
    "nova": "slt", # female, US
    "atlas": "awb", # male, Scottish
    "echo": "bdl", # male, US
    "sage": "jmk", # male, Canadian
    "vivi": "clb", # female, US
    "onyx": "rms", # male, US
    "rune": "ksp", # male, Indian
}
# Functional presets in tts/voices.py -> reuse a display clip as the cloning reference.
# Every value must be a key of VOICES, because the preset files are copied from
# that voice's sample files.
PRESETS = {
    "narrator_warm": "nova",
    "host_energetic": "echo",
    "guest_calm": "vivi",
}
# Directory receiving <voice>.wav / <voice>.txt (relative to the working directory).
SAMPLES_DIR = os.path.join("tts", "voices", "samples")
# Directory receiving <preset>.wav / <preset>.txt; it is the parent of SAMPLES_DIR.
PRESETS_DIR = os.path.join("tts", "voices")
def _fetch(url: str) -> bytes:
    """Return the complete response body of *url* as raw bytes.

    The request is opened with a 60-second timeout and the response is closed
    before returning. Errors raised by ``urllib.request.urlopen`` are not
    caught here; they propagate to the caller.
    """
    response = urllib.request.urlopen(url, timeout=60)
    with response:
        payload = response.read()
    return payload
def _parse_transcripts(raw: str) -> dict[str, str]:
    """Parse the text of a ``txt.done.data`` file into ``{utterance_id: sentence}``.

    Each usable line looks like ``( arctic_a0001 "Some sentence." )``. The id is
    the first whitespace-separated field after the opening parenthesis, and the
    sentence is everything between the first and the last double quote, so
    quotes inside the sentence are kept.

    Lines that do not start with ``(``, carry no id, or lack a closing quote are
    skipped explicitly rather than by catching arbitrary exceptions. The space
    after ``(`` is optional.
    """
    out: dict[str, str] = {}
    for line in raw.splitlines():
        line = line.strip()
        if not line.startswith("("):
            continue
        # Everything before the first quote holds the id; the rest holds the text.
        head, quote, rest = line[1:].partition('"')
        closing = rest.rfind('"')
        fields = head.split()
        if not quote or closing < 0 or not fields:
            # No quoted text, an unbalanced quote, or no utterance id.
            continue
        out[fields[0]] = rest[:closing]
    return out


def _transcripts(spk: str) -> dict:
    """Download and parse the transcript index for CMU ARCTIC speaker *spk*.

    Args:
        spk: Speaker code as used in the distribution's directory names
            (for example ``"slt"``).

    Returns:
        A dict mapping utterance id to its sentence text. Malformed lines in
        the index are left out.

    Raises:
        Whatever ``_fetch`` raises when the download fails.
    """
    # Undecodable bytes are dropped rather than failing the whole index.
    raw = _fetch(f"{BASE}/cmu_us_{spk}_arctic/etc/txt.done.data").decode("utf-8", "ignore")
    return _parse_transcripts(raw)
def _build_sample(vid: str, spk: str) -> None:
    """Assemble and write the sample clip and transcript for one voice.

    Downloads every utterance in ``UTTS`` for speaker *spk*, down-mixes
    multi-channel audio to mono, and joins the clips with ``GAP_S`` seconds of
    silence after each one (including the last). The result is peak-normalised
    to 0.95 and written to ``SAMPLES_DIR/<vid>.wav``; the matching sentences
    go to ``SAMPLES_DIR/<vid>.txt``, joined by single spaces.

    Raises:
        ValueError: if the utterances do not all share one sample rate, since
            concatenating them would produce audio that plays at the wrong speed.
    """
    txts = _transcripts(spk)
    chunks, texts, sr = [], [], None
    for uid in UTTS:
        clip = _fetch(f"{BASE}/cmu_us_{spk}_arctic/wav/{uid}.wav")
        audio, clip_sr = sf.read(io.BytesIO(clip), dtype="float32")
        if sr is None:
            sr = clip_sr
        elif clip_sr != sr:
            raise ValueError(
                f"{spk}/{uid} is sampled at {clip_sr} Hz but earlier clips use {sr} Hz"
            )
        if audio.ndim > 1:
            # Average the channels down to mono.
            audio = audio.mean(axis=1)
        chunks.append(audio)
        chunks.append(np.zeros(int(sr * GAP_S), dtype="float32"))
        if uid in txts:
            texts.append(txts[uid])
        else:
            # The audio is still included, so the transcript no longer matches it.
            print(f" warning: no transcript for {spk}/{uid}; {vid}.txt will not cover it")
    full = np.concatenate(chunks)
    # An all-silent signal has peak 0; fall back to 1.0 to avoid dividing by zero.
    peak = float(np.abs(full).max()) or 1.0
    full = (full / peak * 0.95).astype("float32")
    sf.write(os.path.join(SAMPLES_DIR, f"{vid}.wav"), full, sr)
    with open(os.path.join(SAMPLES_DIR, f"{vid}.txt"), "w", encoding="utf-8") as f:
        f.write(" ".join(texts))
    print(f" sample {vid:6s} <- {spk} {len(full)/sr:4.1f}s")


def build():
    """Generate every voice sample, then copy the preset reference clips.

    Creates ``SAMPLES_DIR`` if needed, builds one ``.wav`` / ``.txt`` pair per
    entry in ``VOICES``, and finally copies the pair of each ``PRESETS`` source
    voice into ``PRESETS_DIR`` under the preset's name. Paths are relative to
    the current working directory. One progress line is printed per file pair.

    Raises:
        ValueError: if one voice's utterances have differing sample rates.
        Download and file-system errors propagate unchanged.
    """
    os.makedirs(SAMPLES_DIR, exist_ok=True)
    for vid, spk in VOICES.items():
        _build_sample(vid, spk)
    for pid, src in PRESETS.items():
        for ext in (".wav", ".txt"):
            shutil.copyfile(os.path.join(SAMPLES_DIR, f"{src}{ext}"),
                            os.path.join(PRESETS_DIR, f"{pid}{ext}"))
        print(f" preset {pid:15s} <- {src}")
# Script entry point: build all samples and presets, then report completion.
# Nothing runs when the module is merely imported.
if __name__ == "__main__":
    build()
    print("done.")
|