Spaces:

build-small-hackathon
/

DoodleBook

Running on Zero

DoodleBook / audio_utils.py

Codex

Add Bedtime Voice tab: Kannada narration, voice cloning, richer story engine

56412a3 19 days ago

1.88 kB

	"""Audio preparation helpers for voice reference clips."""
	from __future__ import annotations
	import os
	import tempfile
	import librosa
	import numpy as np
	import soundfile as sf

	# Sample rate (Hz) that reference clips are resampled to on load and
	# written back out at.
	REFERENCE_SAMPLE_RATE: int = 16000

	# Accepted clip length in seconds, measured after silence trimming.
	# Clips shorter than the minimum are rejected; clips longer than the
	# maximum are truncated to it.
	MIN_REFERENCE_SECONDS: float = 5.0
	MAX_REFERENCE_SECONDS: float = 60.0

	# RMS level (on float samples) below which a clip is rejected as too quiet.
	MIN_RMS: float = 0.005


	def prepare_reference(path: str) -> str:
	    """Clean and validate a voice reference clip for VoxCPM2.

	    The clip is decoded to mono at ``REFERENCE_SAMPLE_RATE``, leading and
	    trailing silence is trimmed, loudness and duration are validated, and
	    the result is written to a fresh temporary WAV file.

	    Args:
	        path: Filesystem path of the recorded or uploaded clip.

	    Returns:
	        Path of a newly created temporary mono 16 kHz PCM-16 WAV file.
	        The caller owns the file and is responsible for deleting it.

	    Raises:
	        ValueError: If ``path`` is empty or does not exist, the clip cannot
	            be decoded, it contains no samples or non-finite samples, it is
	            too quiet, or it is shorter than ``MIN_REFERENCE_SECONDS`` after
	            trimming. The message is written to be shown to the end user.
	    """
	    if not path or not os.path.exists(path):
	        raise ValueError("Please record or upload a voice clip first.")

	    # Any decoder failure is reported as one user-facing error; the original
	    # exception stays attached as the cause for debugging.
	    try:
	        audio, _ = librosa.load(path, sr=REFERENCE_SAMPLE_RATE, mono=True)
	    except Exception as exc:
	        raise ValueError("Could not read that audio clip. Please try a WAV or MP3.") from exc

	    if audio.size == 0 or not np.isfinite(audio).all():
	        raise ValueError("Voice clip looks empty. Please record 5–60 seconds of clear speech.")

	    # Strip leading/trailing silence so the duration check below measures
	    # the trimmed clip rather than the raw recording length.
	    audio = np.asarray(audio, dtype=np.float32)
	    audio, _ = librosa.effects.trim(audio, top_db=35)
	    if audio.size == 0:
	        raise ValueError("Voice clip is too quiet. Please record closer to the microphone.")

	    peak = float(np.max(np.abs(audio)))
	    rms = float(np.sqrt(np.mean(np.square(audio))))
	    if peak <= 0.0 or rms < MIN_RMS:
	        raise ValueError("Voice clip is too quiet. Please record in a quiet room.")

	    duration = audio.size / REFERENCE_SAMPLE_RATE
	    if duration < MIN_REFERENCE_SECONDS:
	        raise ValueError("Please record at least 5 seconds of clear speech.")

	    # Over-long clips are truncated rather than rejected. Slicing is a no-op
	    # when the clip is already within the limit.
	    max_samples = int(MAX_REFERENCE_SECONDS * REFERENCE_SAMPLE_RATE)
	    audio = np.clip(audio[:max_samples], -1.0, 1.0)

	    # mkstemp creates the file on disk immediately; only the path is needed,
	    # so the descriptor is closed before soundfile reopens the file by name.
	    fd, out_path = tempfile.mkstemp(prefix="bedvoice_ref_", suffix=".wav")
	    os.close(fd)
	    try:
	        sf.write(out_path, audio, REFERENCE_SAMPLE_RATE, subtype="PCM_16")
	    except BaseException:
	        # The caller never receives out_path on failure, so remove the
	        # orphaned temp file here before propagating the original error.
	        try:
	            os.remove(out_path)
	        except OSError:
	            pass
	        raise
	    return out_path