import random

import numpy as np
from scipy.io import wavfile
from scipy.signal import butter, lfilter, sawtooth, square

# Per-voice synthesis settings, keyed by the ``droid_type`` argument.
#   waveform      - "square" selects scipy's square(); anything else sawtooth()
#   base_freq     - starting carrier frequency (Hz)
#   freq_range    - (min, max) carrier frequency (Hz)
#   jump_interval - (min, max) time between pitch changes (milliseconds)
#   duty_cycle    - duty passed to square(); unused for sawtooth
#   formants      - list of (lowcut, highcut) bandpass bands (Hz)
#   noise_mix     - gain applied to the envelope-shaped noise layer
#   bit_depth     - bit depth used for the bit-crush quantisation step
# NOTE: the key "agressor" is spelled this way on purpose; it is the runtime
# identifier callers pass in, so it must not be "corrected".
_DROID_PRESETS = {
    "sml": {
        "waveform": "square",
        "base_freq": 800,
        "freq_range": (600, 2400),
        "jump_interval": (80, 250),
        "duty_cycle": 0.3,
        "formants": [(700, 1200), (2000, 3500)],
        "noise_mix": 0.01,
        "bit_depth": 8,
    },
    "chop": {
        "waveform": "saw",
        "base_freq": 200,
        "freq_range": (100, 1200),
        "jump_interval": (30, 120),
        "duty_cycle": 0.5,
        "formants": [(300, 900), (1000, 2500)],
        "noise_mix": 0.02,
        "bit_depth": 4,
    },
    "agressor": {
        "waveform": "square",
        "base_freq": 120,
        "freq_range": (60, 400),
        "jump_interval": (150, 500),
        "duty_cycle": 0.5,
        "formants": [(150, 400), (600, 1200)],
        "noise_mix": 0.03,
        "bit_depth": 6,
    },
}

# Envelope level above which the carrier is switched on.
_GATE_THRESHOLD = 0.05


def build_bandpass_filter(lowcut, highcut, fs, order=2):
    """Formant-style bandpass for droid vocal resonances.

    Args:
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz.
        fs: Sample rate in Hz.
        order: Butterworth filter order.

    Returns:
        The ``(b, a)`` coefficient pair returned by ``scipy.signal.butter``.

    Note:
        The band edges are not clamped; they are passed to ``butter`` as
        fractions of the Nyquist frequency (``fs / 2``).
    """
    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype="band", analog=False)
    return b, a


def build_lowpass_filter(cutoff, fs, order=4):
    """Design a digital Butterworth lowpass filter.

    Args:
        cutoff: Cutoff frequency in Hz.
        fs: Sample rate in Hz.
        order: Butterworth filter order.

    Returns:
        The ``(b, a)`` coefficient pair returned by ``scipy.signal.butter``.
    """
    nyquist = 0.5 * fs
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype="low", analog=False)
    return b, a


def build_noise_filter(fs, cutoff=2200.0, order=2):
    """Design the lowpass filter used to shape the noise layer.

    The cutoff is limited to 90% of the Nyquist frequency before it is
    handed to :func:`build_lowpass_filter`.

    Args:
        fs: Sample rate in Hz.
        cutoff: Requested cutoff frequency in Hz.
        order: Butterworth filter order.

    Returns:
        The ``(b, a)`` coefficient pair of the lowpass filter.
    """
    nyquist = 0.5 * fs
    safe_cutoff = min(cutoff, nyquist * 0.9)
    return build_lowpass_filter(safe_cutoff, fs, order=order)


def droid_synth_array(fs, voice_data, droid_type="sml"):
    """Turn a voice recording into a droid-style signal.

    The amplitude envelope of the input gates and modulates a randomly
    pitch-hopping square/sawtooth carrier, which is then run through the
    preset's formant bandpass filters, mixed with envelope-shaped noise,
    bit-crushed, highpassed at 80 Hz and scaled to 16-bit integers.

    The pitch sequence comes from the ``random`` module and the noise from
    ``np.random``, so the output differs between calls unless both
    generators are seeded.

    Args:
        fs: Sample rate of ``voice_data`` in Hz.
        voice_data: Audio samples. For arrays with more than one dimension
            only ``voice_data[:, 0]`` is used.
        droid_type: Preset name: ``"sml"``, ``"chop"`` or ``"agressor"``.
            Any other value falls back to ``"sml"``.

    Returns:
        ``(fs, output)`` where ``output`` is an ``int16`` array with one
        sample per input sample, or ``None`` when the input is empty or
        silent (peak amplitude below 1e-6).
    """
    voice_data = np.asarray(voice_data)
    if voice_data.ndim > 1:
        voice_data = voice_data[:, 0]
    voice_data = voice_data.astype(np.float32)

    # np.max() raises on a zero-length array, so reject empty input up front.
    if voice_data.size == 0:
        print("ERROR: Input audio is empty (0 samples)")
        return None

    max_val = np.max(np.abs(voice_data))
    print(f"[DEBUG] Input max amplitude: {max_val}")
    if max_val < 1e-6:
        print("ERROR: Input audio is silent (max amplitude < 1e-6)")
        print("Check your TTS model output or input file.")
        return None

    voice_data = voice_data / max_val
    print(
        f"[DEBUG] Normalized audio range: [{voice_data.min():.4f}, {voice_data.max():.4f}]"
    )
    total_sample = len(voice_data)
    print(
        f"[DEBUG] Loaded audio: {total_sample} samples, fs={fs} Hz, duration={total_sample / fs:.2f}s"
    )

    # Envelope follower: rectify, then smooth with a 150 Hz lowpass.
    env_b, env_a = build_lowpass_filter(150.0, fs, order=2)
    envelope = np.abs(voice_data)
    envelope = lfilter(env_b, env_a, envelope)

    # Lift quiet recordings so the mean envelope approaches 0.5; the boost
    # never attenuates (minimum 1x) and is capped at 20x.
    envelope_mean = np.mean(envelope)
    if envelope_mean > 0:
        boost_factor = 0.5 / envelope_mean
        boost_factor = np.clip(boost_factor, 1, 20)
    else:
        boost_factor = 1.0
    envelope = np.clip(envelope * boost_factor, 0, 1)
    print(f"[DEBUG] Envelope boost factor: {boost_factor:.2f}x")
    print(
        f"[DEBUG] Envelope - min: {envelope.min():.4f}, max: {envelope.max():.4f}, mean: {envelope.mean():.4f}"
    )
    print(
        f"[DEBUG] Samples above 0.05 threshold: {np.sum(envelope > _GATE_THRESHOLD)} / {total_sample}"
    )

    # Resolve the preset name first so that the fallback preset and the
    # "sml"-specific pitch logic below always agree with each other.
    if droid_type not in _DROID_PRESETS:
        print(f"WARNING: Unknown droid_type {droid_type!r}; falling back to 'sml'.")
        droid_type = "sml"
    p = _DROID_PRESETS[droid_type]
    print(f"[DEBUG] Using preset: {droid_type}")

    # The pitch trajectory is inherently sequential (each hop depends on the
    # previous frequency), so only the phase is tracked per sample here; the
    # waveform itself is evaluated once, vectorized, after the loop.
    two_pi = 2.0 * np.pi
    phases = np.empty(total_sample)
    phase = 0.0
    current_freq = p["base_freq"]
    freq_step = 0.0
    sample_until_tick = 0
    use_intervals = droid_type == "sml"
    for i in range(total_sample):
        if sample_until_tick <= 0:
            if use_intervals:
                # "sml" hops by a fixed set of frequency ratios.
                interval = random.choice([1.0, 1.25, 1.5, 2.0, 0.75])
                target = float(np.clip(current_freq * interval, *p["freq_range"]))
            else:
                target = random.uniform(*p["freq_range"])
            period_ms = random.uniform(*p["jump_interval"])
            period_samp = max(1, int(fs * period_ms / 1000.0))
            sample_until_tick = period_samp
            if random.random() > 0.3:
                # Glide linearly to the target over the whole period.
                freq_step = (target - current_freq) / period_samp
            else:
                # Jump straight to the target and hold it.
                current_freq = target
                freq_step = 0.0
        current_freq += freq_step
        sample_until_tick -= 1
        phase += two_pi * current_freq / fs
        if phase >= two_pi:
            phase -= two_pi
        phases[i] = phase

    if p["waveform"] == "square":
        wave = square(phases, duty=p["duty_cycle"])
    else:
        wave = sawtooth(phases)
    # Hard gate: the carrier only sounds where the envelope is above threshold.
    gate = np.where(envelope > _GATE_THRESHOLD, 1.0, 0.0)
    carrier = wave * gate
    print(
        f"[DEBUG] Carrier - min: {carrier.min():.4f}, max: {carrier.max():.4f}, nonzero samples: {np.sum(carrier != 0)}"
    )

    # Average the carrier as seen through each formant band.
    filtered = np.zeros_like(carrier)
    for low, high in p["formants"]:
        b, a = build_bandpass_filter(low, high, fs, order=3)
        filtered += lfilter(b, a, carrier)
    filtered = filtered / len(p["formants"])
    print(
        f"[DEBUG] After formants - min: {filtered.min():.4f}, max: {filtered.max():.4f}"
    )

    output = filtered * envelope
    print(
        f"[DEBUG] After envelope modulation - min: {output.min():.4f}, max: {output.max():.4f}"
    )

    if p["noise_mix"] > 0:
        # Keep the noise tied to voiced regions and remove the harsh wideband hiss.
        noise = np.random.normal(0.0, 1.0, total_sample)
        nb, na = build_noise_filter(fs)
        noise = lfilter(nb, na, noise)
        noise_max = np.max(np.abs(noise))
        if noise_max > 1e-6:
            noise = noise / noise_max
        noise *= p["noise_mix"] * envelope
        output += noise
    print(f"[DEBUG] After noise - min: {output.min():.4f}, max: {output.max():.4f}")

    # Bit crush: snap every sample to a grid of 1 / 2**(bit_depth - 1).
    levels = 2 ** (p["bit_depth"] - 1)
    output = np.round(output * levels) / levels
    print(f"[DEBUG] After bit crushing - min: {output.min():.4f}, max: {output.max():.4f}")

    # 80 Hz highpass applied after quantisation.
    hb, ha = butter(2, 80 / (0.5 * fs), btype="high", analog=False)
    output = lfilter(hb, ha, output)
    print(f"[DEBUG] After highpass - min: {output.min():.4f}, max: {output.max():.4f}")

    if np.any(np.isnan(output)):
        print("WARNING: Output contains NaN values. Replacing with zeros.")
        output = np.nan_to_num(output, nan=0.0)

    peak = np.max(np.abs(output))
    print(f"[DEBUG] Final max amplitude: {peak:.4f}")
    if peak > 1e-6:
        # Normalise to 90% of int16 full scale.
        output = np.int16((output / peak) * 32767 * 0.9)
    else:
        print("WARNING: Output is silent. Check your settings.")
        output = np.int16(output * 32767)
    print(f"[DEBUG] Final output dtype: {output.dtype}, shape: {output.shape}")
    return fs, output


def droid_synth(input_wav_path, output_wav_path, droid_type="sml"):
    """Read a WAV file, apply the droid effect and write the result.

    Args:
        input_wav_path: Path of the WAV file to read.
        output_wav_path: Path the processed WAV file is written to.
        droid_type: Preset name: ``"sml"``, ``"chop"`` or ``"agressor"``.

    If :func:`droid_synth_array` rejects the input (empty or silent audio),
    an error is printed and no output file is written.
    """
    print("Loading the file...")
    fs, voice_data = wavfile.read(input_wav_path)
    result = droid_synth_array(fs, voice_data, droid_type=droid_type)
    if result is None:
        # droid_synth_array has already printed the reason.
        print(f"ERROR: No droid audio generated; {output_wav_path} was not written.")
        return
    fs, output = result
    wavfile.write(output_wav_path, fs, output)
    print(f"✓ Droid audio ({droid_type}) saved to {output_wav_path}")


if __name__ == "__main__":
    droid_synth("input.wav", "output.wav", droid_type="sml")