"""Audio preparation helpers for voice reference clips.""" from __future__ import annotations import os import tempfile import librosa import numpy as np import soundfile as sf REFERENCE_SAMPLE_RATE = 16_000 MIN_REFERENCE_SECONDS = 5.0 MAX_REFERENCE_SECONDS = 60.0 MIN_RMS = 0.005 def prepare_reference(path: str) -> str: """Clean and validate a voice reference clip for VoxCPM2. Returns a temporary mono 16 kHz WAV path owned by the caller. """ if not path or not os.path.exists(path): raise ValueError("Please record or upload a voice clip first.") try: audio, _ = librosa.load(path, sr=REFERENCE_SAMPLE_RATE, mono=True) except Exception as exc: raise ValueError("Could not read that audio clip. Please try a WAV or MP3.") from exc if audio.size == 0 or not np.isfinite(audio).all(): raise ValueError("Voice clip looks empty. Please record 5–60 seconds of clear speech.") audio = np.asarray(audio, dtype=np.float32) audio, _ = librosa.effects.trim(audio, top_db=35) if audio.size == 0: raise ValueError("Voice clip is too quiet. Please record closer to the microphone.") peak = float(np.max(np.abs(audio))) rms = float(np.sqrt(np.mean(np.square(audio)))) if peak <= 0.0 or rms < MIN_RMS: raise ValueError("Voice clip is too quiet. Please record in a quiet room.") duration = audio.size / REFERENCE_SAMPLE_RATE if duration < MIN_REFERENCE_SECONDS: raise ValueError("Please record at least 5 seconds of clear speech.") if duration > MAX_REFERENCE_SECONDS: audio = audio[:int(MAX_REFERENCE_SECONDS * REFERENCE_SAMPLE_RATE)] audio = np.clip(audio, -1.0, 1.0) fd, out_path = tempfile.mkstemp(prefix="bedvoice_ref_", suffix=".wav") os.close(fd) sf.write(out_path, audio, REFERENCE_SAMPLE_RATE, subtype="PCM_16") return out_path