# Source: Hugging Face Space "build-small-hackathon/Voinal" (Running on Zero).
# File size: 7,107 bytes.

import numpy as np
from scipy.io import wavfile
from scipy.signal import butter, lfilter, sawtooth, square
import random


def build_bandpass_filter(lowcut, highcut, fs, order=2):
    """Design a digital Butterworth band-pass used as a droid vocal formant.

    Args:
        lowcut: Lower band edge, in the same unit as ``fs``.
        highcut: Upper band edge, in the same unit as ``fs``.
        fs: Sampling rate of the signal the filter will be applied to.
        order: Butterworth order handed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` coefficient pair produced by ``scipy.signal.butter``.
    """
    half_rate = fs / 2.0
    # butter() expects band edges as fractions of the Nyquist frequency.
    band_edges = [edge / half_rate for edge in (lowcut, highcut)]
    numerator, denominator = butter(
        order, band_edges, btype="band", analog=False
    )
    return numerator, denominator


def build_lowpass_filter(cutoff, fs, order=4):
    """Design a digital Butterworth low-pass filter.

    Args:
        cutoff: Cutoff frequency, in the same unit as ``fs``.
        fs: Sampling rate of the signal the filter will be applied to.
        order: Butterworth order handed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` coefficient pair produced by ``scipy.signal.butter``.
    """
    half_rate = fs / 2.0
    # butter() expects the cutoff as a fraction of the Nyquist frequency.
    numerator, denominator = butter(
        order, cutoff / half_rate, btype="low", analog=False
    )
    return numerator, denominator


def build_noise_filter(fs, cutoff=2200.0, order=2):
    """Design the low-pass filter applied to the synthetic noise layer.

    The requested cutoff is capped at 90% of the Nyquist frequency so the
    design stays valid when ``fs`` is low.

    Args:
        fs: Sampling rate of the signal the filter will be applied to.
        cutoff: Desired cutoff frequency, in the same unit as ``fs``.
        order: Filter order forwarded to ``build_lowpass_filter``.

    Returns:
        Whatever ``build_lowpass_filter`` returns for the capped cutoff.
    """
    half_rate = 0.5 * fs
    cutoff_ceiling = half_rate * 0.9
    effective_cutoff = min(cutoff, cutoff_ceiling)
    return build_lowpass_filter(effective_cutoff, fs, order=order)


def _random_walk_phases(total_sample, fs, preset, melodic):
    """Return the per-sample carrier phase (radians) of the droid oscillator.

    The oscillator frequency is re-targeted at random intervals drawn from
    ``preset["jump_interval"]`` (milliseconds). Each re-target either glides
    linearly to the new frequency or jumps to it at once.

    Args:
        total_sample: Number of samples to generate.
        fs: Sampling rate in Hz.
        preset: Preset dict providing ``base_freq``, ``freq_range`` and
            ``jump_interval``.
        melodic: When true, new targets are the current frequency scaled by
            a fixed set of ratios (clipped to ``freq_range``); otherwise they
            are drawn uniformly from ``freq_range``.

    Returns:
        A float64 array of length ``total_sample`` with the accumulated phase.
    """
    phases = np.empty(total_sample, dtype=np.float64)
    two_pi = 2.0 * np.pi

    phase = 0.0
    current_freq = preset["base_freq"]
    freq_step = 0.0
    samples_until_jump = 0

    for i in range(total_sample):
        if samples_until_jump <= 0:
            if melodic:
                interval = random.choice([1.0, 1.25, 1.5, 2.0, 0.75])
                target = np.clip(current_freq * interval, *preset["freq_range"])
            else:
                target = random.uniform(*preset["freq_range"])

            period_ms = random.uniform(*preset["jump_interval"])
            period_samp = max(1, int(fs * period_ms / 1000.0))
            samples_until_jump = period_samp

            # 70% of the time glide towards the target, otherwise jump to it.
            if random.random() > 0.3:
                freq_step = (target - current_freq) / period_samp
            else:
                current_freq = target
                freq_step = 0.0

        current_freq += freq_step
        samples_until_jump -= 1

        phase += two_pi * current_freq / fs
        if phase >= two_pi:
            phase -= two_pi
        phases[i] = phase

    return phases


def droid_synth_array(fs, voice_data, droid_type="sml"):
    """Turn a voice recording into a droid-style synthetic signal.

    Processing chain: peak-normalize the input, extract a smoothed amplitude
    envelope, generate a randomly wandering square/saw carrier gated by that
    envelope, shape it with band-pass "formants", re-apply the envelope, mix
    in filtered noise, bit-crush, high-pass at 80 Hz and convert to int16.

    Args:
        fs: Sampling rate of ``voice_data`` in Hz.
        voice_data: NumPy array of samples. For arrays with more than one
            dimension only ``voice_data[:, 0]`` is used.
        droid_type: Preset name: ``"sml"``, ``"chop"`` or ``"agressor"``.
            Unknown names fall back to ``"sml"``.

    Returns:
        A ``(fs, output)`` tuple where ``output`` is an int16 array with one
        value per input sample. Empty or silent input (peak below 1e-6)
        yields an all-zero array of the input length instead of ``None`` so
        callers can always unpack the result.
    """
    if len(voice_data.shape) > 1:
        voice_data = voice_data[:, 0]

    voice_data = voice_data.astype(np.float32)
    total_sample = len(voice_data)

    # np.max() raises on a zero-length array, so handle empty input up front.
    if total_sample == 0:
        print("ERROR: Input audio is empty (0 samples)")
        return fs, np.zeros(0, dtype=np.int16)

    input_peak = np.max(np.abs(voice_data))
    print(f"[DEBUG] Input max amplitude: {input_peak}")

    if input_peak < 1e-6:
        print("ERROR: Input audio is silent (max amplitude < 1e-6)")
        print("Check your TTS model output or input file.")
        # Keep the return shape consistent with the normal path.
        return fs, np.zeros(total_sample, dtype=np.int16)

    voice_data = voice_data / input_peak
    print(
        f"[DEBUG] Normalized audio range: [{voice_data.min():.4f}, {voice_data.max():.4f}]"
    )

    print(
        f"[DEBUG] Loaded audio: {total_sample} samples, fs={fs} Hz, duration={total_sample / fs:.2f}s"
    )

    # Amplitude envelope: rectify, then smooth with a 150 Hz low-pass.
    env_b, env_a = build_lowpass_filter(150.0, fs, order=2)
    envelope = np.abs(voice_data)
    envelope = lfilter(env_b, env_a, envelope)

    # Boost quiet recordings so the mean envelope approaches 0.5 (1x..20x).
    envelope_mean = np.mean(envelope)
    if envelope_mean > 0:
        boost_factor = 0.5 / envelope_mean
        boost_factor = np.clip(boost_factor, 1, 20)
    else:
        boost_factor = 1.0

    envelope = np.clip(envelope * boost_factor, 0, 1)

    print(f"[DEBUG] Envelope boost factor: {boost_factor:.2f}x")
    print(
        f"[DEBUG] Envelope - min: {envelope.min():.4f}, max: {envelope.max():.4f}, mean: {envelope.mean():.4f}"
    )
    print(
        f"[DEBUG] Samples above 0.05 threshold: {np.sum(envelope > 0.05)} / {total_sample}"
    )

    presets = {
        "sml": {
            "waveform": "square",
            "base_freq": 800,
            "freq_range": (600, 2400),
            "jump_interval": (80, 250),
            "duty_cycle": 0.3,
            "formants": [(700, 1200), (2000, 3500)],
            "noise_mix": 0.01,
            "bit_depth": 8,
        },
        "chop": {
            "waveform": "saw",
            "base_freq": 200,
            "freq_range": (100, 1200),
            "jump_interval": (30, 120),
            "duty_cycle": 0.5,
            "formants": [(300, 900), (1000, 2500)],
            "noise_mix": 0.02,
            "bit_depth": 4,
        },
        "agressor": {
            "waveform": "square",
            "base_freq": 120,
            "freq_range": (60, 400),
            "jump_interval": (150, 500),
            "duty_cycle": 0.5,
            "formants": [(150, 400), (600, 1200)],
            "noise_mix": 0.03,
            "bit_depth": 6,
        },
    }

    # Resolve the preset name once so the fallback preset and the
    # preset-specific frequency walk below always agree with each other.
    if droid_type in presets:
        preset_name = droid_type
    else:
        print(f"WARNING: Unknown droid_type {droid_type!r}; falling back to 'sml'.")
        preset_name = "sml"
    p = presets[preset_name]
    print(f"[DEBUG] Using preset: {preset_name}")

    phases = _random_walk_phases(
        total_sample, fs, p, melodic=(preset_name == "sml")
    )

    # Evaluate the waveform once over the whole phase array rather than once
    # per sample; the per-element result is the same.
    if p["waveform"] == "square":
        wave = square(phases, duty=p["duty_cycle"])
    else:
        wave = sawtooth(phases)

    # Hard gate: the carrier only sounds where the envelope exceeds 0.05.
    carrier = wave * (envelope > 0.05)

    print(
        f"[DEBUG] Carrier - min: {carrier.min():.4f}, max: {carrier.max():.4f}, nonzero samples: {np.sum(carrier != 0)}"
    )

    # Sum the band-passed copies of the carrier and average them.
    filtered = np.zeros_like(carrier)
    for low, high in p["formants"]:
        b, a = build_bandpass_filter(low, high, fs, order=3)
        filtered += lfilter(b, a, carrier)

    filtered = filtered / len(p["formants"])
    print(
        f"[DEBUG] After formants - min: {filtered.min():.4f}, max: {filtered.max():.4f}"
    )

    output = filtered * envelope
    print(
        f"[DEBUG] After envelope modulation - min: {output.min():.4f}, max: {output.max():.4f}"
    )

    if p["noise_mix"] > 0:
        # Keep the noise tied to voiced regions and remove the harsh wideband hiss.
        noise = np.random.normal(0.0, 1.0, total_sample)
        nb, na = build_noise_filter(fs)
        noise = lfilter(nb, na, noise)
        noise_peak = np.max(np.abs(noise))
        if noise_peak > 1e-6:
            noise = noise / noise_peak
        noise *= p["noise_mix"] * envelope
        output += noise
        print(f"[DEBUG] After noise - min: {output.min():.4f}, max: {output.max():.4f}")

    # Bit crush: quantize to 2**(bit_depth - 1) steps per unit amplitude.
    quant_levels = 2 ** (p["bit_depth"] - 1)
    output = np.round(output * quant_levels) / quant_levels
    print(f"[DEBUG] After bit crushing - min: {output.min():.4f}, max: {output.max():.4f}")

    # Second-order 80 Hz high-pass applied after quantization.
    hb, ha = butter(2, 80 / (0.5 * fs), btype="high", analog=False)
    output = lfilter(hb, ha, output)
    print(f"[DEBUG] After highpass - min: {output.min():.4f}, max: {output.max():.4f}")

    if np.any(np.isnan(output)):
        print("WARNING: Output contains NaN values. Replacing with zeros.")
        output = np.nan_to_num(output, nan=0.0)

    output_peak = np.max(np.abs(output))
    print(f"[DEBUG] Final max amplitude: {output_peak:.4f}")

    if output_peak > 1e-6:
        # Normalize to 90% of int16 full scale.
        output = np.int16((output / output_peak) * 32767 * 0.9)
    else:
        print("WARNING: Output is silent. Check your settings.")
        output = np.int16(output * 32767)

    print(f"[DEBUG] Final output dtype: {output.dtype}, shape: {output.shape}")
    return fs, output


def droid_synth(input_wav_path, output_wav_path, droid_type="sml"):
    """Read a WAV file, apply the droid voice effect and write the result.

    Args:
        input_wav_path: Path of the WAV file to read.
        output_wav_path: Path the processed WAV file is written to.
        droid_type: Preset name: sml, chop, agressor.

    Raises:
        ValueError: If the synthesizer returns no result (for example
            because it rejected the input as silent). Nothing is written
            in that case.
    """
    print("Loading the file...")
    fs, voice_data = wavfile.read(input_wav_path)

    result = droid_synth_array(fs, voice_data, droid_type=droid_type)
    # Fail with a clear message rather than an opaque tuple-unpacking
    # TypeError when the synthesizer produced nothing.
    if result is None:
        raise ValueError(
            f"No audio could be synthesized from {input_wav_path!r}; "
            "the input appears to be silent."
        )

    fs, output = result
    wavfile.write(output_wav_path, fs, output)
    print(f"✓ Droid audio ({droid_type}) saved to {output_wav_path}")


if __name__ == "__main__":
    # Script entry point: convert ./input.wav to ./output.wav with the "sml" preset.
    droid_synth(
        input_wav_path="input.wav",
        output_wav_path="output.wav",
        droid_type="sml",
    )