Spaces:

build-small-hackathon
/

Voinal

Running on Zero

Voinal / synth.py

GovIndLok

feat: update TTS to bark-small and other updates in associated project documentation, added links required for submission

2864ffb 13 days ago

Raw

History Blame Contribute Delete

7.11 kB

	import numpy as np
	from scipy.io import wavfile
	from scipy.signal import butter, lfilter, sawtooth, square
	import random


	def build_bandpass_filter(lowcut, highcut, fs, order=2):
	    """Design a digital Butterworth band-pass used as a droid "formant".

	    Args:
	        lowcut: Lower band edge, in the same units as ``fs``.
	        highcut: Upper band edge, in the same units as ``fs``.
	        fs: Sampling rate of the signal the filter will be applied to.
	        order: Order passed straight to ``scipy.signal.butter``.

	    Returns:
	        The ``(b, a)`` coefficient pair produced by ``butter``.
	    """
	    half_rate = 0.5 * fs
	    # butter() expects the band edges as fractions of the Nyquist frequency.
	    band_edges = [lowcut / half_rate, highcut / half_rate]
	    numerator, denominator = butter(
	        order, band_edges, btype="band", analog=False
	    )
	    return numerator, denominator


	def build_lowpass_filter(cutoff, fs, order=4):
	    """Design a digital Butterworth low-pass filter.

	    Args:
	        cutoff: Cutoff frequency, in the same units as ``fs``.
	        fs: Sampling rate of the signal the filter will be applied to.
	        order: Order passed straight to ``scipy.signal.butter``.

	    Returns:
	        The ``(b, a)`` coefficient pair produced by ``butter``.
	    """
	    half_rate = 0.5 * fs
	    # butter() expects the cutoff as a fraction of the Nyquist frequency.
	    numerator, denominator = butter(
	        order, cutoff / half_rate, btype="low", analog=False
	    )
	    return numerator, denominator


	def build_noise_filter(fs, cutoff=2200.0, order=2):
	    """Design the low-pass filter applied to the noise layer.

	    The requested cutoff is capped at 90% of the Nyquist frequency before
	    the design is delegated to ``build_lowpass_filter``.

	    Args:
	        fs: Sampling rate of the signal the filter will be applied to.
	        cutoff: Desired cutoff frequency, in the same units as ``fs``.
	        order: Filter order forwarded to ``build_lowpass_filter``.

	    Returns:
	        The ``(b, a)`` coefficient pair from ``build_lowpass_filter``.
	    """
	    ceiling = (0.5 * fs) * 0.9
	    effective_cutoff = ceiling if ceiling < cutoff else cutoff
	    return build_lowpass_filter(effective_cutoff, fs, order=order)


	def _droid_phase_track(fs, total_sample, preset, musical_jumps):
	    """Return the wrapped oscillator phase (radians) for every sample.

	    The oscillator frequency is re-targeted at random intervals drawn from
	    ``preset["jump_interval"]`` (milliseconds). Each re-target either glides
	    linearly to the new frequency over the interval or jumps to it at once.

	    Args:
	        fs: Sampling rate in Hz.
	        total_sample: Number of phase values to generate.
	        preset: Preset dict providing ``base_freq``, ``freq_range`` and
	            ``jump_interval``.
	        musical_jumps: When true, the next target is the current frequency
	            times a ratio picked from a fixed list, clipped to
	            ``freq_range``; otherwise it is uniform over ``freq_range``.

	    Returns:
	        A float array of length ``total_sample`` with phases in [0, 2*pi).
	    """
	    phases = np.empty(total_sample)
	    two_pi = 2.0 * np.pi

	    phase = 0.0
	    current_freq = preset["base_freq"]
	    freq_step = 0.0
	    sample_until_tick = 0

	    for i in range(total_sample):
	        if sample_until_tick <= 0:
	            if musical_jumps:
	                interval = random.choice([1.0, 1.25, 1.5, 2.0, 0.75])
	                target = np.clip(
	                    current_freq * interval, *preset["freq_range"]
	                )
	            else:
	                target = random.uniform(*preset["freq_range"])

	            period_ms = random.uniform(*preset["jump_interval"])
	            period_samp = max(1, int(fs * period_ms / 1000.0))
	            sample_until_tick = period_samp

	            if random.random() > 0.3:
	                # Glide: reach the target by the end of this interval.
	                freq_step = (target - current_freq) / period_samp
	            else:
	                # Hard jump: switch immediately and hold the frequency.
	                current_freq = target
	                freq_step = 0.0

	        current_freq += freq_step
	        sample_until_tick -= 1

	        phase += two_pi * current_freq / fs
	        if phase >= two_pi:
	            phase -= two_pi
	        phases[i] = phase

	    return phases


	def droid_synth_array(fs, voice_data, droid_type="sml"):
	    """Turn a voice recording into a beeping "droid" rendition of it.

	    The amplitude envelope of the input gates and modulates a randomly
	    hopping square/sawtooth carrier, which is then shaped by band-pass
	    "formant" filters, mixed with a little low-passed noise, bit-crushed,
	    high-passed at 80 Hz and normalised to 90% of int16 full scale.

	    Args:
	        fs: Sampling rate of ``voice_data`` in Hz.
	        voice_data: NumPy array of samples. If it has more than one
	            dimension only column 0 is used.
	        droid_type: Preset name: "sml", "chop" or "agressor". Any other
	            value falls back to "sml".

	    Returns:
	        ``(fs, output)`` where ``output`` is an ``int16`` array with one
	        value per input sample, or ``None`` when the input is empty or
	        silent (peak amplitude below 1e-6).
	    """
	    if len(voice_data.shape) > 1:
	        voice_data = voice_data[:, 0]

	    voice_data = voice_data.astype(np.float32)

	    # np.max raises on an empty array, so treat "no samples" as silence.
	    input_peak = np.max(np.abs(voice_data)) if voice_data.size else 0.0
	    print(f"[DEBUG] Input max amplitude: {input_peak}")

	    if input_peak < 1e-6:
	        print("ERROR: Input audio is silent (max amplitude < 1e-6)")
	        print("Check your TTS model output or input file.")
	        return None

	    voice_data = voice_data / input_peak
	    print(
	        f"[DEBUG] Normalized audio range: [{voice_data.min():.4f}, {voice_data.max():.4f}]"
	    )

	    total_sample = len(voice_data)

	    print(
	        f"[DEBUG] Loaded audio: {total_sample} samples, fs={fs} Hz, duration={total_sample / fs:.2f}s"
	    )

	    # Envelope follower: rectify, then smooth with a 150 Hz low-pass.
	    env_b, env_a = build_lowpass_filter(150.0, fs, order=2)
	    envelope = np.abs(voice_data)
	    envelope = lfilter(env_b, env_a, envelope)

	    # Boost quiet recordings so the mean envelope approaches 0.5, with the
	    # gain limited to the range 1x..20x.
	    envelope_mean = np.mean(envelope)
	    if envelope_mean > 0:
	        boost_factor = 0.5 / envelope_mean
	        boost_factor = np.clip(boost_factor, 1, 20)
	    else:
	        boost_factor = 1.0

	    envelope = np.clip(envelope * boost_factor, 0, 1)

	    print(f"[DEBUG] Envelope boost factor: {boost_factor:.2f}x")
	    print(
	        f"[DEBUG] Envelope - min: {envelope.min():.4f}, max: {envelope.max():.4f}, mean: {envelope.mean():.4f}"
	    )
	    print(
	        f"[DEBUG] Samples above 0.05 threshold: {np.sum(envelope > 0.05)} / {total_sample}"
	    )

	    presets = {
	        "sml": {
	            "waveform": "square",
	            "base_freq": 800,
	            "freq_range": (600, 2400),
	            "jump_interval": (80, 250),
	            "duty_cycle": 0.3,
	            "formants": [(700, 1200), (2000, 3500)],
	            "noise_mix": 0.01,
	            "bit_depth": 8,
	        },
	        "chop": {
	            "waveform": "saw",
	            "base_freq": 200,
	            "freq_range": (100, 1200),
	            "jump_interval": (30, 120),
	            "duty_cycle": 0.5,
	            "formants": [(300, 900), (1000, 2500)],
	            "noise_mix": 0.02,
	            "bit_depth": 4,
	        },
	        "agressor": {
	            "waveform": "square",
	            "base_freq": 120,
	            "freq_range": (60, 400),
	            "jump_interval": (150, 500),
	            "duty_cycle": 0.5,
	            "formants": [(150, 400), (600, 1200)],
	            "noise_mix": 0.03,
	            "bit_depth": 6,
	        },
	    }

	    # Resolve the preset name once so an unknown droid_type gets both the
	    # "sml" parameters and the "sml" frequency-jump behaviour.
	    if droid_type in presets:
	        preset_name = droid_type
	    else:
	        print(f"WARNING: Unknown droid_type {droid_type!r}; falling back to 'sml'.")
	        preset_name = "sml"
	    p = presets[preset_name]
	    print(f"[DEBUG] Using preset: {preset_name}")

	    phases = _droid_phase_track(
	        fs, total_sample, p, musical_jumps=(preset_name == "sml")
	    )

	    # Evaluate the waveform once over the whole phase track instead of
	    # calling into scipy for every individual sample.
	    if p["waveform"] == "square":
	        wave = square(phases, duty=p["duty_cycle"])
	    else:
	        wave = sawtooth(phases)

	    # Hard gate: the carrier only sounds where the envelope exceeds 0.05.
	    carrier = wave * (envelope > 0.05)

	    print(
	        f"[DEBUG] Carrier - min: {carrier.min():.4f}, max: {carrier.max():.4f}, nonzero samples: {np.sum(carrier != 0)}"
	    )

	    # Average the carrier through each formant band-pass.
	    filtered = np.zeros_like(carrier)
	    for low, high in p["formants"]:
	        b, a = build_bandpass_filter(low, high, fs, order=3)
	        filtered += lfilter(b, a, carrier)

	    filtered = filtered / len(p["formants"])
	    print(
	        f"[DEBUG] After formants - min: {filtered.min():.4f}, max: {filtered.max():.4f}"
	    )

	    output = filtered * envelope
	    print(
	        f"[DEBUG] After envelope modulation - min: {output.min():.4f}, max: {output.max():.4f}"
	    )

	    if p["noise_mix"] > 0:
	        # Keep the noise tied to voiced regions and remove the harsh wideband hiss.
	        noise = np.random.normal(0.0, 1.0, total_sample)
	        nb, na = build_noise_filter(fs)
	        noise = lfilter(nb, na, noise)
	        noise_max = np.max(np.abs(noise))
	        if noise_max > 1e-6:
	            noise = noise / noise_max
	        noise = noise * p["noise_mix"] * envelope
	        output += noise
	        print(f"[DEBUG] After noise - min: {output.min():.4f}, max: {output.max():.4f}")

	    # Bit crush: snap every sample to a grid of 2**(bit_depth - 1) steps.
	    quant_levels = 2 ** (p["bit_depth"] - 1)
	    output = np.round(output * quant_levels) / quant_levels
	    print(f"[DEBUG] After bit crushing - min: {output.min():.4f}, max: {output.max():.4f}")

	    # 80 Hz high-pass on the crushed signal.
	    hb, ha = butter(2, 80 / (0.5 * fs), btype="high", analog=False)
	    output = lfilter(hb, ha, output)
	    print(f"[DEBUG] After highpass - min: {output.min():.4f}, max: {output.max():.4f}")

	    if np.any(np.isnan(output)):
	        print("WARNING: Output contains NaN values. Replacing with zeros.")
	        output = np.nan_to_num(output, nan=0.0)

	    output_peak = np.max(np.abs(output))
	    print(f"[DEBUG] Final max amplitude: {output_peak:.4f}")

	    if output_peak > 1e-6:
	        # Normalise to 90% of int16 full scale.
	        output = ((output / output_peak) * 32767 * 0.9).astype(np.int16)
	    else:
	        print("WARNING: Output is silent. Check your settings.")
	        output = (output * 32767).astype(np.int16)

	    print(f"[DEBUG] Final output dtype: {output.dtype}, shape: {output.shape}")
	    return fs, output


	def droid_synth(input_wav_path, output_wav_path, droid_type="sml"):
	    """Read a WAV file, run it through the droid synth and write the result.

	    Args:
	        input_wav_path: Path of the WAV file to read.
	        output_wav_path: Path the processed WAV file is written to.
	        droid_type: Preset name forwarded to ``droid_synth_array``
	            (sml, chop, agressor).

	    Raises:
	        ValueError: If ``droid_synth_array`` produced no audio (it returns
	            ``None`` for silent input). Nothing is written in that case.
	    """
	    print("Loading the file...")
	    fs, voice_data = wavfile.read(input_wav_path)

	    result = droid_synth_array(fs, voice_data, droid_type=droid_type)
	    if result is None:
	        # Unpacking None would fail with an opaque TypeError; report the
	        # real cause instead and leave the output path untouched.
	        raise ValueError(
	            f"No droid audio produced from {input_wav_path!r}: input is silent."
	        )

	    fs, output = result
	    wavfile.write(output_wav_path, fs, output)
	    print(f"✓ Droid audio ({droid_type}) saved to {output_wav_path}")


	if __name__ == "__main__":
	    # Script entry point: convert ./input.wav to ./output.wav using the
	    # "sml" preset.
	    droid_synth(
	        input_wav_path="input.wav",
	        output_wav_path="output.wav",
	        droid_type="sml",
	    )