Spaces:
Running on Zero
Running on Zero
GovIndLok
feat: update TTS to bark-small and other updates in associated project documentation, added links required for submission
2864ffb | import numpy as np | |
| from scipy.io import wavfile | |
| from scipy.signal import butter, lfilter, sawtooth, square | |
| import random | |
def build_bandpass_filter(lowcut, highcut, fs, order=2):
    """Design a digital Butterworth band-pass used as a droid "formant".

    Args:
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz.
        fs: Sample rate in Hz.
        order: Butterworth order passed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` numerator/denominator coefficient pair.
    """
    half_rate = 0.5 * fs
    # butter() expects band edges as fractions of the Nyquist frequency.
    band_edges = [lowcut / half_rate, highcut / half_rate]
    numerator, denominator = butter(order, band_edges, btype="band", analog=False)
    return numerator, denominator
def build_lowpass_filter(cutoff, fs, order=4):
    """Design a digital Butterworth low-pass filter.

    Args:
        cutoff: Cutoff frequency in Hz.
        fs: Sample rate in Hz.
        order: Butterworth order passed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` numerator/denominator coefficient pair.
    """
    half_rate = 0.5 * fs
    # butter() takes the cutoff as a fraction of the Nyquist frequency.
    numerator, denominator = butter(
        order, cutoff / half_rate, btype="low", analog=False
    )
    return numerator, denominator
def build_noise_filter(fs, cutoff=2200.0, order=2):
    """Design the low-pass filter applied to the noise layer.

    The requested cutoff is capped at 90% of the Nyquist frequency so the
    design stays valid at low sample rates.

    Args:
        fs: Sample rate in Hz.
        cutoff: Desired cutoff in Hz.
        order: Filter order forwarded to ``build_lowpass_filter``.

    Returns:
        The ``(b, a)`` pair produced by ``build_lowpass_filter``.
    """
    ceiling = (0.5 * fs) * 0.9
    capped_cutoff = min(cutoff, ceiling)
    return build_lowpass_filter(capped_cutoff, fs, order=order)
def droid_synth_array(fs, voice_data, droid_type="sml"):
    """Turn a voice signal into a gated, bit-crushed "droid" signal.

    The amplitude envelope of the input gates and modulates a square or
    sawtooth carrier whose pitch hops at random intervals. The carrier is
    shaped by band-pass "formant" filters, mixed with low-passed noise,
    quantised to the preset bit depth, high-passed at 80 Hz and finally
    normalised to int16.

    Pitch hops use the global ``random`` module and the noise layer uses
    ``np.random``, so output is only reproducible if both are seeded.

    Args:
        fs: Sample rate in Hz.
        voice_data: NumPy array of samples. For input with more than one
            dimension only column 0 is used.
        droid_type: Preset name: "sml", "chop" or "agressor". Any other
            value falls back to "sml".

    Returns:
        ``(fs, output)`` with ``output`` an int16 array as long as the
        input, or ``None`` when the input is empty or effectively silent
        (peak amplitude below 1e-6).
    """
    if len(voice_data.shape) > 1:
        voice_data = voice_data[:, 0]

    voice_data = voice_data.astype(np.float32)

    # np.max raises on a zero-length array, so treat "no samples" as silence.
    max_val = np.max(np.abs(voice_data)) if voice_data.size else 0.0
    print(f"[DEBUG] Input max amplitude: {max_val}")

    if max_val < 1e-6:
        print("ERROR: Input audio is silent (max amplitude < 1e-6)")
        print("Check your TTS model output or input file.")
        return None

    voice_data = voice_data / max_val
    print(
        f"[DEBUG] Normalized audio range: [{voice_data.min():.4f}, {voice_data.max():.4f}]"
    )

    total_sample = len(voice_data)
    print(
        f"[DEBUG] Loaded audio: {total_sample} samples, fs={fs} Hz, duration={total_sample / fs:.2f}s"
    )

    # Envelope follower: rectify, then smooth with a 150 Hz low-pass.
    env_b, env_a = build_lowpass_filter(150.0, fs, order=2)
    envelope = np.abs(voice_data)
    envelope = lfilter(env_b, env_a, envelope)

    # Boost quiet envelopes towards a mean of 0.5 (gain limited to 1x..20x).
    envelope_mean = np.mean(envelope)
    if envelope_mean > 0:
        boost_factor = 0.5 / envelope_mean
        boost_factor = np.clip(boost_factor, 1, 20)
    else:
        boost_factor = 1.0

    envelope = np.clip(envelope * boost_factor, 0, 1)
    print(f"[DEBUG] Envelope boost factor: {boost_factor:.2f}x")
    print(
        f"[DEBUG] Envelope - min: {envelope.min():.4f}, max: {envelope.max():.4f}, mean: {envelope.mean():.4f}"
    )
    print(
        f"[DEBUG] Samples above 0.05 threshold: {np.sum(envelope > 0.05)} / {total_sample}"
    )

    presets = {
        "sml": {
            "waveform": "square",
            "base_freq": 800,
            "freq_range": (600, 2400),
            "jump_interval": (80, 250),
            "duty_cycle": 0.3,
            "formants": [(700, 1200), (2000, 3500)],
            "noise_mix": 0.01,
            "bit_depth": 8,
        },
        "chop": {
            "waveform": "saw",
            "base_freq": 200,
            "freq_range": (100, 1200),
            "jump_interval": (30, 120),
            "duty_cycle": 0.5,
            "formants": [(300, 900), (1000, 2500)],
            "noise_mix": 0.02,
            "bit_depth": 4,
        },
        "agressor": {
            "waveform": "square",
            "base_freq": 120,
            "freq_range": (60, 400),
            "jump_interval": (150, 500),
            "duty_cycle": 0.5,
            "formants": [(150, 400), (600, 1200)],
            "noise_mix": 0.03,
            "bit_depth": 6,
        },
    }

    # Normalise the name once so the fallback preset and the pitch-hop logic
    # below always agree (an unknown name behaves exactly like "sml").
    if droid_type not in presets:
        print(f"[DEBUG] Unknown droid_type {droid_type!r}; falling back to 'sml'")
        droid_type = "sml"
    p = presets[droid_type]
    print(f"[DEBUG] Using preset: {droid_type}")

    # The pitch trajectory is inherently sequential, so only the phase is
    # tracked per sample; the waveform itself is evaluated in one
    # vectorised call after the loop.
    phases = np.empty(total_sample)
    phase = 0.0
    current_freq = p["base_freq"]
    freq_step = 0.0
    sample_until_tick = 0
    two_pi = 2.0 * np.pi

    for i in range(total_sample):
        if sample_until_tick <= 0:
            # Pick the next target pitch.
            if droid_type == "sml":
                # Hop by a ratio of the current pitch, kept inside the range.
                interval = random.choice([1.0, 1.25, 1.5, 2.0, 0.75])
                target = np.clip(current_freq * interval, *p["freq_range"])
            else:
                target = random.uniform(*p["freq_range"])

            period_ms = random.uniform(*p["jump_interval"])
            period_samp = max(1, int(fs * period_ms / 1000.0))
            sample_until_tick = period_samp

            if random.random() > 0.3:
                # Glide linearly to the target over the whole period.
                freq_step = (target - current_freq) / period_samp
            else:
                # Jump straight to the target and hold it.
                current_freq = target
                freq_step = 0.0

        current_freq += freq_step
        sample_until_tick -= 1

        phase += two_pi * current_freq / fs
        if phase >= two_pi:
            phase -= two_pi
        phases[i] = phase

    if p["waveform"] == "square":
        wave = square(phases, duty=p["duty_cycle"])
    else:
        wave = sawtooth(phases)

    # Hard gate: the carrier only sounds where the envelope exceeds 0.05.
    gate = (envelope > 0.05).astype(np.float64)
    carrier = wave * gate

    print(
        f"[DEBUG] Carrier - min: {carrier.min():.4f}, max: {carrier.max():.4f}, nonzero samples: {np.sum(carrier != 0)}"
    )

    # Average the outputs of the preset's formant band-pass filters.
    filtered = np.zeros_like(carrier)
    for low, high in p["formants"]:
        b, a = build_bandpass_filter(low, high, fs, order=3)
        filtered += lfilter(b, a, carrier)

    filtered = filtered / len(p["formants"])
    print(
        f"[DEBUG] After formants - min: {filtered.min():.4f}, max: {filtered.max():.4f}"
    )

    output = filtered * envelope
    print(
        f"[DEBUG] After envelope modulation - min: {output.min():.4f}, max: {output.max():.4f}"
    )

    if p["noise_mix"] > 0:
        # Keep the noise tied to voiced regions and remove the harsh wideband hiss.
        noise = np.random.normal(0.0, 1.0, total_sample)
        nb, na = build_noise_filter(fs)
        noise = lfilter(nb, na, noise)
        noise_max = np.max(np.abs(noise))
        if noise_max > 1e-6:
            noise = noise / noise_max
        noise *= p["noise_mix"] * envelope
        output += noise

    print(f"[DEBUG] After noise - min: {output.min():.4f}, max: {output.max():.4f}")

    # Bit crush: snap samples to steps of 1 / 2**(bit_depth - 1).
    quant_levels = 2 ** (p["bit_depth"] - 1)
    output = np.round(output * quant_levels) / quant_levels
    print(f"[DEBUG] After bit crushing - min: {output.min():.4f}, max: {output.max():.4f}")

    # 80 Hz high-pass applied after quantisation.
    hb, ha = butter(2, 80 / (0.5 * fs), btype="high", analog=False)
    output = lfilter(hb, ha, output)
    print(f"[DEBUG] After highpass - min: {output.min():.4f}, max: {output.max():.4f}")

    if np.any(np.isnan(output)):
        print("WARNING: Output contains NaN values. Replacing with zeros.")
        output = np.nan_to_num(output, nan=0.0)

    max_val = np.max(np.abs(output))
    print(f"[DEBUG] Final max amplitude: {max_val:.4f}")

    if max_val > 1e-6:
        # Normalise to 90% of int16 full scale.
        output = np.int16((output / max_val) * 32767 * 0.9)
    else:
        print("WARNING: Output is silent. Check your settings.")
        output = np.int16(output * 32767)

    print(f"[DEBUG] Final output dtype: {output.dtype}, shape: {output.shape}")
    return fs, output
def droid_synth(input_wav_path, output_wav_path, droid_type="sml"):
    """Read a WAV file, apply the droid effect and write the result.

    Args:
        input_wav_path: Path of the WAV file to read.
        output_wav_path: Path the processed WAV file is written to.
        droid_type: Preset name forwarded to ``droid_synth_array``
            (sml, chop, agressor).

    Raises:
        ValueError: If ``droid_synth_array`` returns no audio, which it
            does for silent input. Nothing is written in that case.
    """
    print("Loading the file...")
    fs, voice_data = wavfile.read(input_wav_path)

    result = droid_synth_array(fs, voice_data, droid_type=droid_type)
    if result is None:
        # Unpacking None would otherwise fail with an opaque TypeError.
        raise ValueError(
            f"No droid audio produced for {input_wav_path!r}: input audio is silent"
        )

    fs, output = result
    wavfile.write(output_wav_path, fs, output)
    print(f"✓ Droid audio ({droid_type}) saved to {output_wav_path}")
if __name__ == "__main__":
    # Script entry point: convert input.wav to output.wav with the "sml" preset.
    droid_synth(
        input_wav_path="input.wav",
        output_wav_path="output.wav",
        droid_type="sml",
    )