Voinal / synth.py
GovIndLok
feat: update TTS to bark-small and other updates in associated project documentation, added links required for submission
2864ffb
Raw
History Blame Contribute Delete
7.11 kB
import numpy as np
from scipy.io import wavfile
from scipy.signal import butter, lfilter, sawtooth, square
import random
def build_bandpass_filter(lowcut, highcut, fs, order=2):
    """Design a Butterworth band-pass used for droid vocal resonances.

    Args:
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz.
        fs: Sampling rate in Hz.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        Tuple ``(b, a)`` of numerator and denominator coefficients of the
        digital filter.
    """
    # butter() expects band edges as fractions of the Nyquist frequency.
    half_rate = fs * 0.5
    band_edges = [lowcut / half_rate, highcut / half_rate]
    numerator, denominator = butter(
        order, band_edges, btype="band", analog=False
    )
    return numerator, denominator
def build_lowpass_filter(cutoff, fs, order=4):
    """Design a Butterworth low-pass filter.

    Args:
        cutoff: Cutoff frequency in Hz.
        fs: Sampling rate in Hz.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        Tuple ``(b, a)`` of numerator and denominator coefficients of the
        digital filter.
    """
    # Express the cutoff as a fraction of the Nyquist frequency for butter().
    half_rate = fs * 0.5
    relative_cutoff = cutoff / half_rate
    numerator, denominator = butter(
        order, relative_cutoff, btype="low", analog=False
    )
    return numerator, denominator
def build_noise_filter(fs, cutoff=2200.0, order=2):
    """Design the low-pass applied to the noise layer.

    The requested cutoff is capped at 90% of the Nyquist frequency so the
    filter design stays valid at low sampling rates.

    Args:
        fs: Sampling rate in Hz.
        cutoff: Desired cutoff frequency in Hz.
        order: Filter order forwarded to ``build_lowpass_filter``.

    Returns:
        Whatever ``build_lowpass_filter`` returns for the capped cutoff.
    """
    ceiling = (0.5 * fs) * 0.9
    limited_cutoff = ceiling if ceiling < cutoff else cutoff
    return build_lowpass_filter(limited_cutoff, fs, order=order)
# Oscillator and tone-shaping settings for each supported droid voice.
# Note: the third key is spelled "agressor" and callers must use that spelling.
_DROID_PRESETS = {
    "sml": {
        "waveform": "square",
        "base_freq": 800,
        "freq_range": (600, 2400),
        "jump_interval": (80, 250),
        "duty_cycle": 0.3,
        "formants": [(700, 1200), (2000, 3500)],
        "noise_mix": 0.01,
        "bit_depth": 8,
    },
    "chop": {
        "waveform": "saw",
        "base_freq": 200,
        "freq_range": (100, 1200),
        "jump_interval": (30, 120),
        "duty_cycle": 0.5,
        "formants": [(300, 900), (1000, 2500)],
        "noise_mix": 0.02,
        "bit_depth": 4,
    },
    "agressor": {
        "waveform": "square",
        "base_freq": 120,
        "freq_range": (60, 400),
        "jump_interval": (150, 500),
        "duty_cycle": 0.5,
        "formants": [(150, 400), (600, 1200)],
        "noise_mix": 0.03,
        "bit_depth": 6,
    },
}


def _droid_carrier(fs, envelope, preset, stepped_intervals):
    """Render the gated oscillator whose pitch jumps or glides at random.

    Args:
        fs: Sampling rate in Hz.
        envelope: Per-sample amplitude envelope; samples at or below 0.05
            mute the carrier.
        preset: One entry of ``_DROID_PRESETS``.
        stepped_intervals: When true, each new pitch target is the current
            frequency times a ratio drawn from a fixed list (clipped to the
            preset range); otherwise targets are drawn uniformly from the
            preset range.

    Returns:
        Float array, same length as ``envelope``, holding the gated waveform.
    """
    total = len(envelope)
    freq_lo, freq_hi = preset["freq_range"]
    jump_lo, jump_hi = preset["jump_interval"]
    two_pi = 2.0 * np.pi

    phases = np.empty(total)
    phase = 0.0
    current_freq = preset["base_freq"]
    freq_step = 0.0
    samples_until_tick = 0

    # Only the phase trajectory needs a sample-by-sample loop, because each
    # tick's random decisions depend on the frequency reached so far.
    for i in range(total):
        if samples_until_tick <= 0:
            if stepped_intervals:
                interval = random.choice([1.0, 1.25, 1.5, 2.0, 0.75])
                target = np.clip(current_freq * interval, freq_lo, freq_hi)
            else:
                target = random.uniform(freq_lo, freq_hi)
            period_ms = random.uniform(jump_lo, jump_hi)
            period_samp = max(1, int(fs * period_ms / 1000.0))
            samples_until_tick = period_samp
            if random.random() > 0.3:
                # Glide: reach the target exactly when the next tick fires.
                freq_step = (target - current_freq) / period_samp
            else:
                # Hard jump: switch immediately and hold the pitch.
                current_freq = target
                freq_step = 0.0
        current_freq += freq_step
        samples_until_tick -= 1
        phase += two_pi * current_freq / fs
        if phase >= two_pi:
            phase -= two_pi
        phases[i] = phase

    # Evaluate the waveform once over the whole phase array instead of calling
    # scipy on a scalar for every sample; the per-sample values are identical.
    if preset["waveform"] == "square":
        wave = square(phases, duty=preset["duty_cycle"])
    else:
        wave = sawtooth(phases)
    gate = (envelope > 0.05).astype(wave.dtype)
    return wave * gate


def droid_synth_array(fs, voice_data, droid_type="sml"):
    """Turn a voice recording into a droid-style signal.

    The amplitude envelope of the input drives a randomly pitched oscillator,
    which is then passed through the preset's formant band-passes, modulated by
    the envelope, mixed with low-passed noise, bit-crushed, high-passed at
    80 Hz and scaled to 16-bit integers at 90% of full scale.

    The pitch trajectory and the noise layer use ``random`` and ``np.random``,
    so results differ between calls unless both generators are seeded.

    Args:
        fs: Sampling rate of ``voice_data`` in Hz.
        voice_data: Audio samples. For multi-dimensional input only column 0
            is used.
        droid_type: Preset name: ``"sml"``, ``"chop"`` or ``"agressor"``.
            Unknown names use the ``"sml"`` preset values.

    Returns:
        Tuple ``(fs, samples)`` where ``samples`` is a 1-D ``int16`` array.
        For empty or silent input (peak below 1e-6) an error is printed and
        ``samples`` is all zeros with the input's length, so callers can
        always unpack the result.
    """
    voice_data = np.asarray(voice_data)
    if len(voice_data.shape) > 1:
        voice_data = voice_data[:, 0]
    voice_data = voice_data.astype(np.float32)
    total_sample = len(voice_data)

    # np.max raises on a zero-size array, so treat empty input as silent.
    max_val = np.max(np.abs(voice_data)) if total_sample else np.float32(0.0)
    print(f"[DEBUG] Input max amplitude: {max_val}")
    if max_val < 1e-6:
        print("ERROR: Input audio is silent (max amplitude < 1e-6)")
        print("Check your TTS model output or input file.")
        # Same return shape as the normal path: silence in, silence out.
        return fs, np.zeros(total_sample, dtype=np.int16)

    voice_data = voice_data / max_val
    print(
        f"[DEBUG] Normalized audio range: [{voice_data.min():.4f}, {voice_data.max():.4f}]"
    )
    print(
        f"[DEBUG] Loaded audio: {total_sample} samples, fs={fs} Hz, duration={total_sample / fs:.2f}s"
    )

    # Envelope follower: rectify, then smooth with a 150 Hz low-pass.
    env_b, env_a = build_lowpass_filter(150.0, fs, order=2)
    envelope = lfilter(env_b, env_a, np.abs(voice_data))
    envelope_mean = np.mean(envelope)
    if envelope_mean > 0:
        # Lift the average level toward 0.5, limited to a 1x-20x gain.
        boost_factor = np.clip(0.5 / envelope_mean, 1, 20)
    else:
        boost_factor = 1.0
    envelope = np.clip(envelope * boost_factor, 0, 1)
    print(f"[DEBUG] Envelope boost factor: {boost_factor:.2f}x")
    print(
        f"[DEBUG] Envelope - min: {envelope.min():.4f}, max: {envelope.max():.4f}, mean: {envelope.mean():.4f}"
    )
    print(
        f"[DEBUG] Samples above 0.05 threshold: {np.sum(envelope > 0.05)} / {total_sample}"
    )

    p = _DROID_PRESETS.get(droid_type, _DROID_PRESETS["sml"])
    print(f"[DEBUG] Using preset: {droid_type}")

    carrier = _droid_carrier(fs, envelope, p, droid_type == "sml")
    print(
        f"[DEBUG] Carrier - min: {carrier.min():.4f}, max: {carrier.max():.4f}, nonzero samples: {np.sum(carrier != 0)}"
    )

    # Average the carrier through each formant band-pass.
    filtered = np.zeros_like(carrier)
    for low, high in p["formants"]:
        b, a = build_bandpass_filter(low, high, fs, order=3)
        filtered += lfilter(b, a, carrier)
    filtered = filtered / len(p["formants"])
    print(
        f"[DEBUG] After formants - min: {filtered.min():.4f}, max: {filtered.max():.4f}"
    )

    output = filtered * envelope
    print(
        f"[DEBUG] After envelope modulation - min: {output.min():.4f}, max: {output.max():.4f}"
    )

    if p["noise_mix"] > 0:
        # Keep the noise tied to voiced regions and remove the harsh wideband hiss.
        noise = np.random.normal(0.0, 1.0, total_sample)
        nb, na = build_noise_filter(fs)
        noise = lfilter(nb, na, noise)
        noise_max = np.max(np.abs(noise))
        if noise_max > 1e-6:
            noise = noise / noise_max
        noise *= p["noise_mix"] * envelope
        output += noise
        print(f"[DEBUG] After noise - min: {output.min():.4f}, max: {output.max():.4f}")

    # Bit crush: snap samples to a grid of 2**(bit_depth - 1) steps per unit.
    levels = 2 ** (p["bit_depth"] - 1)
    output = np.round(output * levels) / levels
    print(f"[DEBUG] After bit crushing - min: {output.min():.4f}, max: {output.max():.4f}")

    # 80 Hz high-pass on the crushed signal.
    hb, ha = butter(2, 80 / (0.5 * fs), btype="high", analog=False)
    output = lfilter(hb, ha, output)
    print(f"[DEBUG] After highpass - min: {output.min():.4f}, max: {output.max():.4f}")

    if np.any(np.isnan(output)):
        print("WARNING: Output contains NaN values. Replacing with zeros.")
        output = np.nan_to_num(output, nan=0.0)

    peak = np.max(np.abs(output))
    print(f"[DEBUG] Final max amplitude: {peak:.4f}")
    if peak > 1e-6:
        # Normalize to 90% of int16 full scale.
        output = np.int16((output / peak) * 32767 * 0.9)
    else:
        print("WARNING: Output is silent. Check your settings.")
        output = np.int16(output * 32767)
    print(f"[DEBUG] Final output dtype: {output.dtype}, shape: {output.shape}")
    return fs, output
def droid_synth(input_wav_path, output_wav_path, droid_type="sml"):
    """Read a WAV file, apply the droid voice effect and write the result.

    Args:
        input_wav_path: Path of the WAV file to read.
        output_wav_path: Path the processed WAV file is written to.
        droid_type: Preset name: ``"sml"``, ``"chop"`` or ``"agressor"``.

    Raises:
        ValueError: If the synthesis step yields no result for the input, so
            nothing can be written.
    """
    print("Loading the file...")
    fs, voice_data = wavfile.read(input_wav_path)
    result = droid_synth_array(fs, voice_data, droid_type=droid_type)
    # Guard before unpacking so a rejected input surfaces as a clear error
    # rather than a "cannot unpack NoneType" TypeError.
    if result is None:
        raise ValueError(
            f"No droid audio was produced for {input_wav_path!r}; "
            "the input appears to be silent."
        )
    fs, output = result
    wavfile.write(output_wav_path, fs, output)
    print(f"✓ Droid audio ({droid_type}) saved to {output_wav_path}")
if __name__ == "__main__":
    # Script entry point: convert input.wav into output.wav with the "sml" preset.
    droid_synth(
        input_wav_path="input.wav",
        output_wav_path="output.wav",
        droid_type="sml",
    )