Spaces:

build-small-hackathon
/

podify

Running on Zero

File size: 3,612 Bytes

310ca13

"""Build voice sample / reference clips from CMU ARCTIC.

fish-speech / OpenAudio ship no reference audio, so we source short, permissively
licensed clips from CMU ARCTIC (festvox.org — "distributed with no restrictions for
academic, research and commercial use"). Each voice clip concatenates a few ARCTIC
sentences (~20s) so it doubles as a clean cloning reference for OpenAudio S1-mini.

Outputs:
  tts/voices/samples/<voice>.wav + .txt   — played on the Voices page
  tts/voices/<preset>.wav + .txt          — picked up by tts/voices.py PRESETS so the
                                            podcast hosts actually sound distinct

Run from the repo root:  python scripts/build_voice_samples.py
WAVs are tracked via Git LFS (see .gitattributes).
"""

from __future__ import annotations

import io
import os
import shutil
import urllib.request

import numpy as np
import soundfile as sf

# Root of the CMU ARCTIC distribution; per-speaker directories are addressed below it.
BASE = "http://festvox.org/cmu_arctic/cmu_arctic"
# Utterance ids arctic_a0001 .. arctic_a0006 — six sentences come to roughly 20s.
UTTS = ["arctic_a%04d" % idx for idx in range(1, 7)]
# Seconds of silence appended after every utterance when clips are concatenated.
GAP_S = 0.15

# Voices page id -> CMU ARCTIC speaker code (each one a distinct timbre/accent).
VOICES = dict(
    nova="slt",   # female, US
    atlas="awb",  # male, Scottish
    echo="bdl",   # male, US
    sage="jmk",   # male, Canadian
    vivi="clb",   # female, US
    onyx="rms",   # male, US
    rune="ksp",   # male, Indian
)

# Functional preset name (see tts/voices.py) -> display voice whose clip is reused
# as that preset's cloning reference.
PRESETS = dict(
    narrator_warm="nova",
    host_energetic="echo",
    guest_calm="vivi",
)

# Preset references live in tts/voices; the Voices-page samples sit one level below.
PRESETS_DIR = os.path.join("tts", "voices")
SAMPLES_DIR = os.path.join(PRESETS_DIR, "samples")


def _fetch(url: str) -> bytes:
    """Download *url* and return the complete response body.

    The request is opened with a 60-second timeout; any error raised by
    ``urllib.request.urlopen`` propagates to the caller.
    """
    with urllib.request.urlopen(url, timeout=60) as response:
        payload = response.read()
    return payload


def _transcripts(spk: str) -> dict[str, str]:
    """Fetch and parse the prompt file for one CMU ARCTIC speaker.

    Downloads ``etc/txt.done.data`` for *spk* and keeps every line that opens
    with ``(`` and carries a double-quoted sentence, e.g.
    ``( arctic_a0001 "Some sentence." )``.

    Args:
        spk: Speaker code used in the ``cmu_us_<spk>_arctic`` directory name.

    Returns:
        Mapping of utterance id to the text between the first and last double
        quote on its line. Lines that do not fit the expected shape are skipped.
    """
    raw = _fetch(f"{BASE}/cmu_us_{spk}_arctic/etc/txt.done.data").decode("utf-8", "ignore")
    out: dict[str, str] = {}
    for line in raw.splitlines():
        line = line.strip()
        if not line.startswith("("):
            continue
        first = line.find('"')
        last = line.rfind('"')
        # Skip lines with no quoted text, or with a single unbalanced quote.
        if first == -1 or last <= first:
            continue
        # The id sits between the opening paren and the quoted text; taking it from
        # that slice works whether or not a space follows the paren.
        head = line[1:first].split()
        if not head:
            continue
        out[head[0]] = line[first + 1 : last]
    return out


def build() -> None:
    """Download ARCTIC utterances and write every sample clip and preset reference.

    For each entry in ``VOICES`` the utterances listed in ``UTTS`` are fetched,
    down-mixed to mono, concatenated with ``GAP_S`` seconds of silence after each
    one, peak-normalised to 0.95 and written to ``SAMPLES_DIR/<voice>.wav``. The
    matching ``.txt`` holds the space-joined transcripts. Each ``PRESETS`` entry
    is then produced by copying the corresponding sample pair into ``PRESETS_DIR``.

    Raises:
        ValueError: If the utterances of one speaker do not share a sample rate;
            concatenating them would play part of the clip at the wrong speed.
    """
    os.makedirs(SAMPLES_DIR, exist_ok=True)
    # Presets are copied here at the end; create it explicitly instead of relying
    # on it being a parent of SAMPLES_DIR.
    os.makedirs(PRESETS_DIR, exist_ok=True)
    for vid, spk in VOICES.items():
        txts = _transcripts(spk)
        chunks, texts, sr = [], [], None
        for uid in UTTS:
            wav_bytes = _fetch(f"{BASE}/cmu_us_{spk}_arctic/wav/{uid}.wav")
            audio, rate = sf.read(io.BytesIO(wav_bytes), dtype="float32")
            if sr is None:
                sr = rate
            elif rate != sr:
                raise ValueError(
                    f"{spk}/{uid}: sample rate {rate} Hz differs from the {sr} Hz "
                    "of earlier utterances"
                )
            if audio.ndim > 1:
                # Multi-channel input: average the channels down to mono.
                audio = audio.mean(axis=1)
            chunks.append(audio)
            chunks.append(np.zeros(int(sr * GAP_S), dtype="float32"))
            text = txts.get(uid)
            if text is None:
                # The audio is still included, so the written text no longer matches
                # the clip — surface that instead of dropping it silently.
                print(f"  warning: no transcript for {spk}/{uid}; {vid}.txt will not match the audio")
            else:
                texts.append(text)
        full = np.concatenate(chunks)
        # `or 1.0` guards the division when the whole clip is digital silence.
        peak = float(np.abs(full).max()) or 1.0
        full = (full / peak * 0.95).astype("float32")
        sf.write(os.path.join(SAMPLES_DIR, f"{vid}.wav"), full, sr)
        with open(os.path.join(SAMPLES_DIR, f"{vid}.txt"), "w", encoding="utf-8") as f:
            f.write(" ".join(texts))
        print(f"  sample {vid:6s} <- {spk}  {len(full)/sr:4.1f}s")

    for pid, src in PRESETS.items():
        # A preset is a byte-for-byte copy of its source sample (audio + transcript).
        for ext in (".wav", ".txt"):
            shutil.copyfile(os.path.join(SAMPLES_DIR, f"{src}{ext}"),
                            os.path.join(PRESETS_DIR, f"{pid}{ext}"))
        print(f"  preset {pid:15s} <- {src}")


# Script entry point: build all clips, then report completion. Nothing runs on import.
if __name__ == "__main__":
    build()
    print("done.")