Spaces:
Running on Zero
Running on Zero
File size: 7,107 Bytes
02008ac 9abe845 02008ac 9abe845 02008ac 2651616 9abe845 02008ac 9abe845 02008ac 9abe845 02008ac 9abe845 2651616 9abe845 2651616 9abe845 2651616 9abe845 02008ac 9abe845 02008ac 9abe845 2864ffb 9abe845 02008ac 9abe845 02008ac 9abe845 02008ac 9abe845 02008ac 9abe845 02008ac 9abe845 02008ac 9abe845 02008ac 9abe845 2651616 9abe845 2651616 9abe845 02008ac 9abe845 2651616 02008ac 9abe845 02008ac 9abe845 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 | import numpy as np
from scipy.io import wavfile
from scipy.signal import butter, lfilter, sawtooth, square
import random
def build_bandpass_filter(lowcut, highcut, fs, order=2):
    """Design a digital Butterworth band-pass used as a droid vocal formant.

    Args:
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz.
        fs: Sampling rate in Hz.
        order: Butterworth order handed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` coefficient pair produced by ``butter``.
    """
    # butter() expects band edges as fractions of the Nyquist frequency.
    half_rate = 0.5 * fs
    band_edges = [lowcut / half_rate, highcut / half_rate]
    numerator, denominator = butter(
        order, band_edges, btype="band", analog=False
    )
    return numerator, denominator
def build_lowpass_filter(cutoff, fs, order=4):
    """Design a digital Butterworth low-pass filter.

    Args:
        cutoff: Cutoff frequency in Hz.
        fs: Sampling rate in Hz.
        order: Butterworth order handed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` coefficient pair produced by ``butter``.
    """
    # butter() expects the cutoff as a fraction of the Nyquist frequency.
    half_rate = 0.5 * fs
    relative_cutoff = cutoff / half_rate
    numerator, denominator = butter(
        order, relative_cutoff, btype="low", analog=False
    )
    return numerator, denominator
def build_noise_filter(fs, cutoff=2200.0, order=2):
    """Design the low-pass filter applied to the noise layer.

    The requested cutoff is capped at 90% of the Nyquist frequency before
    the design is delegated to ``build_lowpass_filter``.

    Args:
        fs: Sampling rate in Hz.
        cutoff: Desired cutoff in Hz.
        order: Filter order forwarded to ``build_lowpass_filter``.

    Returns:
        Whatever ``build_lowpass_filter`` returns for the capped cutoff.
    """
    ceiling = (0.5 * fs) * 0.9
    capped_cutoff = ceiling if ceiling < cutoff else cutoff
    return build_lowpass_filter(capped_cutoff, fs, order=order)
def _droid_phase_track(fs, total_sample, preset, stepped_intervals):
    """Return the oscillator phase (radians) for every output sample.

    The pitch follows a random trajectory: at each "tick" a new target
    frequency and tick length are drawn, and the oscillator either glides
    linearly to the target over the tick or jumps to it immediately.

    Args:
        fs: Sampling rate in Hz.
        total_sample: Number of samples to generate.
        preset: Preset dict providing ``base_freq``, ``freq_range`` and
            ``jump_interval`` (milliseconds).
        stepped_intervals: When true, targets are the current frequency
            multiplied by one of a fixed set of ratios (clipped to
            ``freq_range``); otherwise targets are drawn uniformly from
            ``freq_range``.

    Returns:
        A float array of length ``total_sample`` holding the accumulated
        phase, reduced by one full turn whenever it reaches ``2*pi``.
    """
    two_pi = 2.0 * np.pi
    phases = np.empty(total_sample)
    phase = 0.0
    current_freq = preset["base_freq"]
    freq_step = 0.0
    sample_until_tick = 0
    for i in range(total_sample):
        if sample_until_tick <= 0:
            if stepped_intervals:
                interval = random.choice([1.0, 1.25, 1.5, 2.0, 0.75])
                target = np.clip(
                    current_freq * interval, *preset["freq_range"]
                )
            else:
                target = random.uniform(*preset["freq_range"])
            period_ms = random.uniform(*preset["jump_interval"])
            period_samp = max(1, int(fs * period_ms / 1000.0))
            sample_until_tick = period_samp
            if random.random() > 0.3:
                # Glide: reach the target linearly over this tick.
                freq_step = (target - current_freq) / period_samp
            else:
                # Jump: switch pitch immediately and hold it.
                current_freq = target
                freq_step = 0.0
        current_freq += freq_step
        sample_until_tick -= 1
        phase += two_pi * current_freq / fs
        if phase >= two_pi:
            phase -= two_pi
        phases[i] = phase
    return phases


def droid_synth_array(fs, voice_data, droid_type="sml"):
    """Turn a voice recording into a synthetic "droid" sound.

    The amplitude envelope of the input gates and modulates a randomly
    pitched square/saw carrier, which is shaped by formant band-pass
    filters, mixed with filtered noise, bit-crushed and high-passed.

    Args:
        fs: Sampling rate of ``voice_data`` in Hz.
        voice_data: Audio samples. If the array has more than one
            dimension only ``[:, 0]`` is used.
        droid_type: Preset name: ``"sml"``, ``"chop"`` or ``"agressor"``.
            Any other value is treated exactly like ``"sml"``.

    Returns:
        ``(fs, output)`` where ``output`` is an ``int16`` array with the
        same number of samples as the input, or ``None`` when the input
        is empty or silent (peak amplitude below ``1e-6``).
    """
    voice_data = np.asarray(voice_data)
    if voice_data.ndim > 1:
        voice_data = voice_data[:, 0]
    voice_data = voice_data.astype(np.float32)

    # np.max() raises on a zero-length array, so treat "no samples" as silence.
    input_peak = np.max(np.abs(voice_data)) if voice_data.size else 0.0
    print(f"[DEBUG] Input max amplitude: {input_peak}")
    if input_peak < 1e-6:
        print("ERROR: Input audio is silent (max amplitude < 1e-6)")
        print("Check your TTS model output or input file.")
        return None
    voice_data = voice_data / input_peak
    print(
        f"[DEBUG] Normalized audio range: [{voice_data.min():.4f}, {voice_data.max():.4f}]"
    )
    total_sample = len(voice_data)
    print(
        f"[DEBUG] Loaded audio: {total_sample} samples, fs={fs} Hz, duration={total_sample / fs:.2f}s"
    )

    # Amplitude envelope: rectify, then smooth with a 150 Hz low-pass.
    env_b, env_a = build_lowpass_filter(150.0, fs, order=2)
    envelope = lfilter(env_b, env_a, np.abs(voice_data))
    envelope_mean = np.mean(envelope)
    if envelope_mean > 0:
        # Aim for a mean of 0.5, but never attenuate and never boost past 20x.
        boost_factor = np.clip(0.5 / envelope_mean, 1, 20)
    else:
        boost_factor = 1.0
    envelope = np.clip(envelope * boost_factor, 0, 1)
    print(f"[DEBUG] Envelope boost factor: {boost_factor:.2f}x")
    print(
        f"[DEBUG] Envelope - min: {envelope.min():.4f}, max: {envelope.max():.4f}, mean: {envelope.mean():.4f}"
    )
    print(
        f"[DEBUG] Samples above 0.05 threshold: {np.sum(envelope > 0.05)} / {total_sample}"
    )

    presets = {
        "sml": {
            "waveform": "square",
            "base_freq": 800,
            "freq_range": (600, 2400),
            "jump_interval": (80, 250),
            "duty_cycle": 0.3,
            "formants": [(700, 1200), (2000, 3500)],
            "noise_mix": 0.01,
            "bit_depth": 8,
        },
        "chop": {
            "waveform": "saw",
            "base_freq": 200,
            "freq_range": (100, 1200),
            "jump_interval": (30, 120),
            "duty_cycle": 0.5,
            "formants": [(300, 900), (1000, 2500)],
            "noise_mix": 0.02,
            "bit_depth": 4,
        },
        "agressor": {
            "waveform": "square",
            "base_freq": 120,
            "freq_range": (60, 400),
            "jump_interval": (150, 500),
            "duty_cycle": 0.5,
            "formants": [(150, 400), (600, 1200)],
            "noise_mix": 0.03,
            "bit_depth": 6,
        },
    }
    # Resolve the fallback once so an unknown name gets the complete "sml"
    # behaviour (settings AND interval-stepped pitch), not just its settings.
    preset_name = droid_type if droid_type in presets else "sml"
    p = presets[preset_name]
    print(f"[DEBUG] Using preset: {preset_name}")

    # The pitch trajectory is inherently sequential, but the waveform and
    # the gate are not: evaluate them once over the whole phase array
    # instead of calling scipy for every single sample.
    phases = _droid_phase_track(fs, total_sample, p, preset_name == "sml")
    if p["waveform"] == "square":
        wave = square(phases, duty=p["duty_cycle"])
    else:
        wave = sawtooth(phases)
    # Hard gate: the carrier only sounds where the envelope exceeds 0.05.
    carrier = wave * (envelope > 0.05)
    print(
        f"[DEBUG] Carrier - min: {carrier.min():.4f}, max: {carrier.max():.4f}, nonzero samples: {np.sum(carrier != 0)}"
    )

    # Average of the carrier passed through each formant band.
    filtered = np.zeros_like(carrier)
    for low, high in p["formants"]:
        b, a = build_bandpass_filter(low, high, fs, order=3)
        filtered += lfilter(b, a, carrier)
    filtered = filtered / len(p["formants"])
    print(
        f"[DEBUG] After formants - min: {filtered.min():.4f}, max: {filtered.max():.4f}"
    )

    output = filtered * envelope
    print(
        f"[DEBUG] After envelope modulation - min: {output.min():.4f}, max: {output.max():.4f}"
    )

    if p["noise_mix"] > 0:
        # Keep the noise tied to voiced regions and remove the harsh wideband hiss.
        noise = np.random.normal(0.0, 1.0, total_sample)
        nb, na = build_noise_filter(fs)
        noise = lfilter(nb, na, noise)
        noise_peak = np.max(np.abs(noise))
        if noise_peak > 1e-6:
            noise = noise / noise_peak
        noise *= p["noise_mix"] * envelope
        output += noise
    print(f"[DEBUG] After noise - min: {output.min():.4f}, max: {output.max():.4f}")

    # Bit crush: snap samples to a grid with 2**(bit_depth - 1) steps per unit.
    quant_levels = 2 ** (p["bit_depth"] - 1)
    output = np.round(output * quant_levels) / quant_levels
    print(f"[DEBUG] After bit crushing - min: {output.min():.4f}, max: {output.max():.4f}")

    # 80 Hz high-pass applied after quantization.
    hb, ha = butter(2, 80 / (0.5 * fs), btype="high", analog=False)
    output = lfilter(hb, ha, output)
    print(f"[DEBUG] After highpass - min: {output.min():.4f}, max: {output.max():.4f}")

    if np.any(np.isnan(output)):
        print("WARNING: Output contains NaN values. Replacing with zeros.")
        output = np.nan_to_num(output, nan=0.0)

    output_peak = np.max(np.abs(output))
    print(f"[DEBUG] Final max amplitude: {output_peak:.4f}")
    if output_peak > 1e-6:
        # Normalize to 90% of the int16 full scale.
        output = ((output / output_peak) * 32767 * 0.9).astype(np.int16)
    else:
        print("WARNING: Output is silent. Check your settings.")
        output = (output * 32767).astype(np.int16)
    print(f"[DEBUG] Final output dtype: {output.dtype}, shape: {output.shape}")
    return fs, output
def droid_synth(input_wav_path, output_wav_path, droid_type="sml"):
    """Read a WAV file, apply the droid effect and write the result.

    Args:
        input_wav_path: Path of the WAV file to read.
        output_wav_path: Path the processed WAV file is written to.
        droid_type: Preset name passed to ``droid_synth_array``
            (sml, chop, agressor).

    Raises:
        ValueError: If ``droid_synth_array`` produced no audio (it returns
            ``None`` for silent input). Nothing is written in that case.
    """
    print("Loading the file...")
    fs, voice_data = wavfile.read(input_wav_path)
    result = droid_synth_array(fs, voice_data, droid_type=droid_type)
    if result is None:
        # Unpacking None would fail with an opaque TypeError; report the
        # actual problem and leave the output path untouched.
        raise ValueError(
            f"No droid audio generated from {input_wav_path!r}: "
            "the input contains no usable (non-silent) audio."
        )
    fs, output = result
    wavfile.write(output_wav_path, fs, output)
    print(f"✓ Droid audio ({droid_type}) saved to {output_wav_path}")
if __name__ == "__main__":
    # Script entry point: convert input.wav to output.wav with the "sml" preset.
    droid_synth(
        input_wav_path="input.wav",
        output_wav_path="output.wav",
        droid_type="sml",
    )
|