Spaces:

build-small-hackathon
/

podify

Running on Zero

podify / scripts /build_voice_samples.py

jayaspjacob

Add playable voice samples + cap episode length at 2 min

310ca13 19 days ago

3.61 kB

	"""Build voice sample / reference clips from CMU ARCTIC.

	fish-speech / OpenAudio ship no reference audio, so we source short, permissively
	licensed clips from CMU ARCTIC (festvox.org — "distributed with no restrictions for
	academic, research and commercial use"). Each voice clip concatenates a few ARCTIC
	sentences (~20s) so it doubles as a clean cloning reference for OpenAudio S1-mini.

	Outputs:
	  tts/voices/samples/<voice>.wav + .txt — played on the Voices page
	  tts/voices/<preset>.wav + .txt — picked up by tts/voices.py PRESETS so the
	    podcast hosts actually sound distinct

	Run from the repo root: python scripts/build_voice_samples.py
	WAVs are tracked via Git LFS (see .gitattributes).
	"""

	from __future__ import annotations

	import io
	import os
	import shutil
	import urllib.request

	import numpy as np
	import soundfile as sf

	# Root URL of the CMU ARCTIC distribution; per-speaker folders hang off it.
	BASE = "http://festvox.org/cmu_arctic/cmu_arctic"

	# Utterance ids stitched into every clip: arctic_a0001 .. arctic_a0006
	# (6 sentences ≈ 20s).
	UTTS = [f"arctic_a{number:04d}" for number in range(1, 7)]

	# Seconds of silence appended after each utterance.
	GAP_S = 0.15

	# Voices page id -> CMU ARCTIC speaker code (distinct timbre/accent each).
	# Insertion order is the order in which clips are built and reported.
	VOICES = {
	    "nova": "slt",   # female, US
	    "atlas": "awb",  # male, Scottish
	    "echo": "bdl",   # male, US
	    "sage": "jmk",   # male, Canadian
	    "vivi": "clb",   # female, US
	    "onyx": "rms",   # male, US
	    "rune": "ksp",   # male, Indian
	}

	# Functional preset name (see tts/voices.py) -> Voices page id whose display
	# clip is reused as that preset's cloning reference.
	PRESETS = {
	    "narrator_warm": "nova",
	    "host_energetic": "echo",
	    "guest_calm": "vivi",
	}

	# Output folders, relative to the current working directory.
	PRESETS_DIR = os.path.join("tts", "voices")
	SAMPLES_DIR = os.path.join(PRESETS_DIR, "samples")


	def _fetch(url: str) -> bytes:
	    """Return the complete response body of *url* as bytes.

	    The request uses a 60-second timeout; any error raised by
	    ``urllib.request.urlopen`` propagates to the caller.
	    """
	    with urllib.request.urlopen(url, timeout=60) as response:
	        payload = response.read()
	    return payload


	def _transcripts(spk: str) -> dict[str, str]:
	    """Download and parse the prompt file (``etc/txt.done.data``) of one speaker.

	    A line is accepted when it starts with ``(`` and contains a double-quoted
	    span, i.e. it looks like ``( <utterance-id> "<text>" )``. The utterance id
	    is the first token between the opening parenthesis and the first quote, so
	    both ``( id "text" )`` and ``(id "text")`` parse to the same key.

	    Args:
	        spk: CMU ARCTIC speaker code used to build the download URL.

	    Returns:
	        Mapping of utterance id to transcript text (the characters between
	        the first and the last double quote of the line). Lines that do not
	        match the expected shape, or that carry no id, are skipped.
	    """
	    raw = _fetch(f"{BASE}/cmu_us_{spk}_arctic/etc/txt.done.data").decode("utf-8", "ignore")
	    out: dict[str, str] = {}
	    for line in raw.splitlines():
	        line = line.strip()
	        if not line.startswith("(") or '"' not in line:
	            continue
	        first_quote = line.index('"')
	        # Everything between "(" and the opening quote is the id field; taking
	        # it from there (instead of the second whitespace token of the whole
	        # line) keeps the key correct when no space follows the parenthesis.
	        head = line[1:first_quote].split()
	        if not head:
	            continue  # quoted text without an utterance id
	        out[head[0]] = line[first_quote + 1 : line.rindex('"')]
	    return out


	def _build_clip(spk: str) -> tuple[np.ndarray, int, str]:
	    """Download the UTTS sentences of one speaker and join them into one clip.

	    Args:
	        spk: CMU ARCTIC speaker code.

	    Returns:
	        ``(audio, sample_rate, text)``: mono float32 audio peak-normalised to
	        0.95 with ``GAP_S`` seconds of silence after every utterance, the
	        sample rate shared by all utterances, and the space-joined
	        transcripts of exactly the utterances present in the audio.

	    Raises:
	        ValueError: if the utterances do not all share one sample rate.
	        RuntimeError: if no utterance of the speaker has a transcript.
	    """
	    txts = _transcripts(spk)
	    chunks: list[np.ndarray] = []
	    texts: list[str] = []
	    sr: int | None = None
	    for uid in UTTS:
	        if uid not in txts:
	            # The .txt is used as the reference text of the .wav; including
	            # audio without its transcript would make the two drift apart.
	            print(f" warning: no transcript for {spk}/{uid}; utterance skipped")
	            continue
	        audio, rate = sf.read(
	            io.BytesIO(_fetch(f"{BASE}/cmu_us_{spk}_arctic/wav/{uid}.wav")),
	            dtype="float32",
	        )
	        if sr is None:
	            sr = rate
	        elif rate != sr:
	            # Concatenating mixed rates would play part of the clip at the
	            # wrong speed once written with a single sample rate.
	            raise ValueError(
	                f"{spk}/{uid}: sample rate {rate} differs from {sr} of earlier utterances"
	            )
	        if audio.ndim > 1:
	            audio = audio.mean(axis=1)  # down-mix multi-channel audio to mono
	        chunks.append(audio)
	        chunks.append(np.zeros(int(sr * GAP_S), dtype="float32"))
	        texts.append(txts[uid])
	    if not chunks:
	        raise RuntimeError(f"no usable utterances for speaker {spk!r}")
	    full = np.concatenate(chunks)
	    # "or 1.0" avoids dividing by zero when the audio is pure silence.
	    peak = float(np.abs(full).max()) or 1.0
	    full = (full / peak * 0.95).astype("float32")
	    return full, sr, " ".join(texts)


	def build() -> None:
	    """Generate every Voices-page sample and every preset reference clip.

	    For each entry of VOICES a ``<voice>.wav`` / ``<voice>.txt`` pair is
	    written to SAMPLES_DIR; afterwards each entry of PRESETS gets a copy of
	    its source pair in PRESETS_DIR under the preset name. Paths are relative
	    to the current working directory. One progress line is printed per file
	    pair.
	    """
	    os.makedirs(SAMPLES_DIR, exist_ok=True)
	    os.makedirs(PRESETS_DIR, exist_ok=True)
	    for vid, spk in VOICES.items():
	        full, sr, text = _build_clip(spk)
	        sf.write(os.path.join(SAMPLES_DIR, f"{vid}.wav"), full, sr)
	        with open(os.path.join(SAMPLES_DIR, f"{vid}.txt"), "w", encoding="utf-8") as f:
	            f.write(text)
	        print(f" sample {vid:6s} <- {spk} {len(full)/sr:4.1f}s")

	    # Presets reuse clips built above, so this must run after the loop.
	    for pid, src in PRESETS.items():
	        for ext in (".wav", ".txt"):
	            shutil.copyfile(os.path.join(SAMPLES_DIR, f"{src}{ext}"),
	                            os.path.join(PRESETS_DIR, f"{pid}{ext}"))
	        print(f" preset {pid:15s} <- {src}")


	# Script entry point: rebuild all sample and preset clips, then confirm.
	if __name__ == "__main__":
	    build()
	    print("done.")