DoodleBook / audio_utils.py
Codex
Add Bedtime Voice tab: Kannada narration, voice cloning, richer story engine
56412a3
Raw
History Blame Contribute Delete
1.88 kB
"""Audio preparation helpers for voice reference clips."""
from __future__ import annotations
import os
import tempfile
import librosa
import numpy as np
import soundfile as sf
# Sample rate (Hz) that clips are resampled to on load and written out at.
REFERENCE_SAMPLE_RATE = 16_000
# Shortest clip length (seconds, measured after silence trimming) that is accepted.
MIN_REFERENCE_SECONDS = 5.0
# Clips longer than this (seconds, after trimming) are truncated, not rejected.
MAX_REFERENCE_SECONDS = 60.0
# Minimum RMS amplitude of the trimmed clip; anything below is rejected as too quiet.
MIN_RMS = 0.005
def prepare_reference(path: str) -> str:
    """Clean and validate a voice reference clip for VoxCPM2.

    The clip is loaded as mono audio at ``REFERENCE_SAMPLE_RATE``, leading and
    trailing silence is trimmed, loudness and duration are validated, overly
    long clips are truncated to ``MAX_REFERENCE_SECONDS``, and the result is
    written as a 16-bit PCM WAV file.

    Args:
        path: Filesystem path of the recorded or uploaded clip.

    Returns:
        Path of a temporary mono 16 kHz WAV file owned by the caller, who is
        responsible for deleting it.

    Raises:
        ValueError: If ``path`` is empty or does not exist, the clip cannot
            be decoded, it is empty or contains non-finite samples, it is
            too quiet, or it is shorter than ``MIN_REFERENCE_SECONDS`` after
            trimming.
    """
    if not path or not os.path.exists(path):
        raise ValueError("Please record or upload a voice clip first.")

    # Any decoder failure is reported as one user-facing error; the original
    # exception stays attached as the cause for debugging.
    try:
        audio, _ = librosa.load(path, sr=REFERENCE_SAMPLE_RATE, mono=True)
    except Exception as exc:
        raise ValueError("Could not read that audio clip. Please try a WAV or MP3.") from exc

    if audio.size == 0 or not np.isfinite(audio).all():
        raise ValueError("Voice clip looks empty. Please record 5–60 seconds of clear speech.")

    audio = np.asarray(audio, dtype=np.float32)
    # Strip leading/trailing audio more than 35 dB below the clip's peak.
    audio, _ = librosa.effects.trim(audio, top_db=35)
    if audio.size == 0:
        raise ValueError("Voice clip is too quiet. Please record closer to the microphone.")

    peak = float(np.max(np.abs(audio)))
    rms = float(np.sqrt(np.mean(np.square(audio))))
    if peak <= 0.0 or rms < MIN_RMS:
        raise ValueError("Voice clip is too quiet. Please record in a quiet room.")

    duration = audio.size / REFERENCE_SAMPLE_RATE
    if duration < MIN_REFERENCE_SECONDS:
        raise ValueError("Please record at least 5 seconds of clear speech.")
    if duration > MAX_REFERENCE_SECONDS:
        # Too-long clips are not rejected; only the first MAX seconds are kept.
        audio = audio[:int(MAX_REFERENCE_SECONDS * REFERENCE_SAMPLE_RATE)]

    # Keep samples inside [-1, 1] before they are written as 16-bit PCM.
    audio = np.clip(audio, -1.0, 1.0)

    fd, out_path = tempfile.mkstemp(prefix="bedvoice_ref_", suffix=".wav")
    # Only the reserved path is needed; soundfile opens the file itself.
    os.close(fd)
    try:
        sf.write(out_path, audio, REFERENCE_SAMPLE_RATE, subtype="PCM_16")
    except BaseException:
        # The caller never receives out_path on failure, so remove the temp
        # file here instead of leaking it, then surface the original error.
        try:
            os.remove(out_path)
        except OSError:
            pass
        raise
    return out_path