Spaces:

yoyolicoris
/

diffvox-ito

Sleeping

App Files Files Community

yoyolicoris committed on Jul 8, 2025

Commit

1bfa935

1 Parent(s): 6d75109

add vocal effects style transfer demo with Gradio interface

Browse files

Files changed (1) hide show

app.py +439 -0

app.py ADDED Viewed

	@@ -0,0 +1,439 @@

+import gradio as gr
+import numpy as np
+from scipy.io.wavfile import read
+import matplotlib.pyplot as plt
+import torch
+from torch import Tensor
+import math
+import yaml
+import json
+import pyloudnorm as pyln
+from hydra.utils import instantiate
+from soxr import resample
+from functools import partial, reduce
+from itertools import accumulate
+from torchcomp import coef2ms, ms2coef
+from copy import deepcopy
+from pathlib import Path
+from typing import Tuple, List, Optional, Union
+from modules.utils import vec2statedict, get_chunks
+from modules.fx import clip_delay_eq_Q
+from plot_utils import get_log_mags_from_eq
+def chain_functions(*functions):
+    return lambda *initial_args: reduce(
+        lambda xs, f: f(*xs) if isinstance(xs, tuple) else f(xs),
+        functions,
+        initial_args,
+    )
+# Markdown strings passed to gr.Markdown in the UI section at the bottom of the file.
+title_md = "# Vocal Effects Style Transfer Demo"
+description_md = """
+This is a demo of the paper [DiffVox: A Differentiable Model for Capturing and Analysing Professional Effects Distributions](https://arxiv.org/abs/2504.14735), accepted at DAFx 2025.
+In this demo, you can upload a raw vocal audio file (in mono) and use our model to apply professional-quality vocal processing by tweaking generated effects settings to enhance your vocals!
+The effects consist of series of EQ, compressor, delay, and reverb.
+The generator is a PCA model derived from 365 vocal effects presets fitted with the same effects chain.
+This interface allows you to control the principal components (PCs) of the generator, randomise them, and render the audio.
+To give you some idea, we empirically found that the first PC controls the amount of reverb and the second PC controls the amount of brightness.
+Note that adding these PCs together does not necessarily mean that their effects are additive in the final audio.
+We found sometimes the effects of least important PCs are more perceptible.
+Try to play around with the sliders and buttons and see what you can come up with!
+> **_Note:_** To upload your own audio, click X on the top right corner of the input audio block.
+"""
+# Bounds used by the `default_pc_slider` partial in the UI section.
+SLIDER_MAX = 3
+SLIDER_MIN = -3
+# NOTE(review): NUMBER_OF_PCS and TEMPERATURE are not referenced anywhere in the visible code.
+NUMBER_OF_PCS = 4
+TEMPERATURE = 0.7
+# YAML configuration files; only the "approx" entry is opened below.
+CONFIG_PATH = {
+    "realtime": "presets/rt_config.yaml",
+    "approx": "presets/fx_config.yaml",
+}
+# One folder of preset data per dataset.
+PRESET_PATH = {
+    "internal": Path("presets/internal/"),
+    "medleydb": Path("presets/medleydb/"),
+}
+# File names that are joined onto a preset folder when loading.
+PCA_PARAM_FILE = "gaussian.npz"
+INFO_PATH = "info.json"
+MASK_PATH = "feature_mask.npy"
+PARAMS_PATH = "raw_params.npy"
+TRAIN_INDEX_PATH = "train_index.npy"
+# NOTE(review): only mentioned in commented-out code in the UI section.
+EXAMPLE_PATH = "eleanor_erased.wav"
+# The "model" entry of the approx config is later handed to `instantiate` to build the effect chain.
+with open(CONFIG_PATH["approx"]) as fp:
+    fx_config = yaml.safe_load(fp)["model"]
+def load_presets(preset_folder: Path) -> Tensor:
+    raw_params = torch.from_numpy(np.load(preset_folder / PARAMS_PATH))
+    feature_mask = torch.from_numpy(np.load(preset_folder / MASK_PATH))
+    train_index_path = preset_folder / TRAIN_INDEX_PATH
+    if train_index_path.exists():
+        train_index = torch.from_numpy(np.load(train_index_path))
+        raw_params = raw_params[train_index]
+    presets = raw_params[:, feature_mask].contiguous()
+    return presets
+def load_gaussian_params(f: Union[Path, str]) -> Tuple[Tensor, Tensor]:
+    gauss_params = np.load(f)
+    mean = torch.from_numpy(gauss_params["mean"]).float()
+    cov = torch.from_numpy(gauss_params["cov"]).float()
+    return mean, cov
+# Preset tensors and Gaussian (mean, cov) pairs for every dataset, keyed like PRESET_PATH.
+preset_dict = {k: load_presets(v) for k, v in PRESET_PATH.items()}
+gaussian_params_dict = {
+    k: load_gaussian_params(v / PCA_PARAM_FILE) for k, v in PRESET_PATH.items()
+}
+# Global latent variable
+# z = torch.zeros_like(mean)
+# Parameter names and their original shapes are read from the internal dataset's info file.
+with open(PRESET_PATH["internal"] / INFO_PATH) as f:
+    info = json.load(f)
+param_keys = info["params_keys"]
+# Empty shape lists are replaced by [1].
+original_shapes = list(
+    map(lambda lst: lst if len(lst) else [1], info["params_original_shapes"])
+)
+# Keep everything get_chunks returns except its last value.
+*vec2dict_args, _ = get_chunks(param_keys, original_shapes)
+vec2dict_args = [param_keys, original_shapes] + vec2dict_args
+# Pre-bind those values to vec2statedict as keyword arguments; the names below
+# are paired with vec2dict_args positionally by zip.
+vec2dict = partial(
+    vec2statedict,
+    **dict(
+        zip(
+            [
+                "keys",
+                "original_shapes",
+                "selected_chunks",
+                "position",
+                "U_matrix_shape",
+            ],
+            vec2dict_args,
+        )
+    ),
+)
+# Mean vector of the internal dataset's Gaussian.
+internal_mean = gaussian_params_dict["internal"][0]
+# Global effect: built from the config and initialised with the internal mean.
+# vec2fx deep-copies this object rather than modifying it.
+global_fx = instantiate(fx_config)
+# global_fx.eval()
+global_fx.load_state_dict(vec2dict(internal_mean), strict=False)
+# Loudness meter used by inference, constructed for 44100 Hz audio.
+meter = pyln.Meter(44100)
+@torch.no_grad()
+def inference(audio, ratio, fx):
+    sr, y = audio
+    if sr != 44100:
+        y = resample(y, sr, 44100)
+    if y.dtype.kind != "f":
+        y = y / 32768.0
+    if y.ndim == 1:
+        y = y[:, None]
+    loudness = meter.integrated_loudness(y)
+    y = pyln.normalize.loudness(y, loudness, -18.0)
+    y = torch.from_numpy(y).float().T.unsqueeze(0)
+    if y.shape[1] != 1:
+        y = y.mean(dim=1, keepdim=True)
+    direct, wet = fx(y)
+    direct = direct.squeeze(0).T.numpy()
+    wet = wet.squeeze(0).T.numpy()
+    angle = ratio * math.pi * 0.5
+    test_clipping = direct + wet
+    # rendered = fx(y).squeeze(0).T.numpy()
+    if np.max(np.abs(test_clipping)) > 1:
+        scaler = np.max(np.abs(test_clipping))
+        # rendered = rendered / scaler
+        direct = direct / scaler
+        wet = wet / scaler
+    rendered = math.sqrt(2) * (math.cos(angle) * direct + math.sin(angle) * wet)
+    return (
+        (44100, (rendered * 32768).astype(np.int16)),
+        (44100, (direct * 32768).astype(np.int16)),
+        (
+            44100,
+            (wet * 32768).astype(np.int16),
+        ),
+    )
+def model2json(fx):
+    fx_names = ["PK1", "PK2", "LS", "HS", "LP", "HP", "DRC"]
+    results = {k: v.toJSON() for k, v in zip(fx_names, fx)} | {
+        "Panner": fx[7].pan.toJSON()
+    }
+    spatial_fx = {
+        "DLY": fx[7].effects[0].toJSON() | {"LP": fx[7].effects[0].eq.toJSON()},
+        "FDN": fx[7].effects[1].toJSON()
+        | {
+            "Tone correction PEQ": {
+                k: v.toJSON() for k, v in zip(fx_names[:4], fx[7].effects[1].eq)
+            }
+        },
+        "Cross Send (dB)": fx[7].params.sends_0.log10().mul(20).item(),
+    }
+    return {
+        "Direct": results,
+        "Sends": spatial_fx,
+    }
+@torch.no_grad()
+def plot_eq(fx):
+    fig, ax = plt.subplots(figsize=(6, 4), constrained_layout=True)
+    w, eq_log_mags = get_log_mags_from_eq(fx[:6])
+    ax.plot(w, sum(eq_log_mags), color="black", linestyle="-")
+    for i, eq_log_mag in enumerate(eq_log_mags):
+        ax.plot(w, eq_log_mag, "k-", alpha=0.3)
+        ax.fill_between(w, eq_log_mag, 0, facecolor="gray", edgecolor="none", alpha=0.1)
+    ax.set_xlabel("Frequency (Hz)")
+    ax.set_ylabel("Magnitude (dB)")
+    ax.set_xlim(20, 20000)
+    ax.set_ylim(-40, 20)
+    ax.set_xscale("log")
+    ax.grid()
+    return fig
+@torch.no_grad()
+def plot_comp(fx):
+    fig, ax = plt.subplots(figsize=(6, 5), constrained_layout=True)
+    comp = fx[6]
+    cmp_th = comp.params.cmp_th.item()
+    exp_th = comp.params.exp_th.item()
+    cmp_ratio = comp.params.cmp_ratio.item()
+    exp_ratio = comp.params.exp_ratio.item()
+    make_up = comp.params.make_up.item()
+    # print(cmp_ratio, cmp_th, exp_ratio, exp_th, make_up)
+    comp_in = np.linspace(-80, 0, 100)
+    comp_curve = np.where(
+        comp_in > cmp_th,
+        comp_in - (comp_in - cmp_th) * (cmp_ratio - 1) / cmp_ratio,
+        comp_in,
+    )
+    comp_out = (
+        np.where(
+            comp_curve < exp_th,
+            comp_curve - (exp_th - comp_curve) / exp_ratio,
+            comp_curve,
+        )
+        + make_up
+    )
+    ax.plot(comp_in, comp_out, c="black", linestyle="-")
+    ax.plot(comp_in, comp_in, c="r", alpha=0.5)
+    ax.set_xlabel("Input Level (dB)")
+    ax.set_ylabel("Output Level (dB)")
+    ax.set_xlim(-80, 0)
+    ax.set_ylim(-80, 0)
+    ax.grid()
+    return fig
+@torch.no_grad()
+def plot_delay(fx):
+    fig, ax = plt.subplots(figsize=(6, 4), constrained_layout=True)
+    delay = fx[7].effects[0]
+    w, eq_log_mags = get_log_mags_from_eq([delay.eq])
+    log_gain = delay.params.gain.log10().item() * 20
+    d = delay.params.delay.item() / 1000
+    log_mag = sum(eq_log_mags)
+    ax.plot(w, log_mag + log_gain, color="black", linestyle="-")
+    log_feedback = delay.params.feedback.log10().item() * 20
+    for i in range(1, 10):
+        feedback_log_mag = log_mag * (i + 1) + log_feedback * i + log_gain
+        ax.plot(
+            w,
+            feedback_log_mag,
+            c="black",
+            alpha=max(0, (10 - i * d * 4) / 10),
+            linestyle="-",
+        )
+    ax.set_xscale("log")
+    ax.set_xlim(20, 20000)
+    ax.set_ylim(-80, 0)
+    ax.set_xlabel("Frequency (Hz)")
+    ax.set_ylabel("Magnitude (dB)")
+    ax.grid()
+    return fig
+@torch.no_grad()
+def plot_reverb(fx):
+    fig, ax = plt.subplots(figsize=(6, 4), constrained_layout=True)
+    fdn = fx[7].effects[1]
+    w, eq_log_mags = get_log_mags_from_eq(fdn.eq)
+    bc = fdn.params.c.norm() * fdn.params.b.norm()
+    log_bc = torch.log10(bc).item() * 20
+    # eq_log_mags = [x + log_bc / len(eq_log_mags) for x in eq_log_mags]
+    # ax.plot(w, sum(eq_log_mags), color="black", linestyle="-")
+    eq_log_mags = sum(eq_log_mags) + log_bc
+    ax.plot(w, eq_log_mags, color="black", linestyle="-")
+    ax.set_xlabel("Frequency (Hz)")
+    ax.set_ylabel("Magnitude (dB)")
+    ax.set_xlim(20, 20000)
+    ax.set_ylim(-40, 20)
+    ax.set_xscale("log")
+    ax.grid()
+    return fig
+@torch.no_grad()
+def plot_t60(fx):
+    fig, ax = plt.subplots(figsize=(6, 4), constrained_layout=True)
+    fdn = fx[7].effects[1]
+    gamma = fdn.params.gamma.squeeze().numpy()
+    delays = fdn.delays.numpy()
+    w = np.linspace(0, 22050, gamma.size)
+    t60 = -60 / (20 * np.log10(gamma + 1e-10) / np.min(delays)) / 44100
+    ax.plot(w, t60, color="black", linestyle="-")
+    ax.set_xlabel("Frequency (Hz)")
+    ax.set_ylabel("T60 (s)")
+    ax.set_xlim(20, 20000)
+    ax.set_ylim(0, 9)
+    ax.set_xscale("log")
+    ax.grid()
+    return fig
+def vec2fx(x):
+    fx = deepcopy(global_fx)
+    fx.load_state_dict(vec2dict(x), strict=False)
+    fx.apply(partial(clip_delay_eq_Q, Q=0.707))
+    return fx
+# Gradio UI definition. Components appear on the page in the order they are
+# created here.
+# NOTE(review): no event handlers are attached to any component in this
+# block, so `inference` and the plotting helpers are never called from it.
+with gr.Blocks() as demo:
+    # State initialised with the mean of the internal dataset.
+    fx_params = gr.State(internal_mean)
+    # NOTE(review): `fx` is not used again in the visible code.
+    fx = vec2fx(fx_params.value)
+    # sr, y = read(EXAMPLE_PATH)
+    # Component factories with shared defaults. Only `default_audio_block`
+    # is used in the visible code.
+    default_pc_slider = partial(
+        gr.Slider, minimum=SLIDER_MIN, maximum=SLIDER_MAX, interactive=True, value=0
+    )
+    default_audio_block = partial(gr.Audio, type="numpy", loop=True)
+    default_freq_slider = partial(gr.Slider, label="Frequency (Hz)", interactive=True)
+    default_gain_slider = partial(gr.Slider, label="Gain (dB)", interactive=True)
+    default_q_slider = partial(gr.Slider, label="Q", interactive=True)
+    # Page title and description.
+    gr.Markdown(
+        title_md,
+        elem_id="title",
+    )
+    with gr.Row():
+        gr.Markdown(
+            description_md,
+            elem_id="description",
+        )
+        # gr.Image("diffvox_diagram.png", elem_id="diagram")
+    # Left column: input audio and buttons. Right column: output audio,
+    # dry/wet slider and the direct/wet players.
+    with gr.Row():
+        with gr.Column():
+            audio_input = default_audio_block(
+                sources="upload",
+                label="Input Audio",
+                # value=(sr, y)
+            )
+            with gr.Row():
+                reset_button = gr.Button(
+                    "Reset",
+                    elem_id="reset-button",
+                )
+                render_button = gr.Button(
+                    "Run", elem_id="render-button", variant="primary"
+                )
+        with gr.Column():
+            audio_output = default_audio_block(label="Output Audio", interactive=False)
+            dry_wet_ratio = gr.Slider(
+                minimum=0,
+                maximum=1,
+                value=0.5,
+                label="Dry/Wet Ratio",
+                interactive=True,
+            )
+            direct_output = default_audio_block(label="Direct Audio", interactive=False)
+            wet_output = default_audio_block(label="Wet Audio", interactive=False)
+    # Options shared by all style transfer methods.
+    _ = gr.Markdown("## Common Parameters")
+    with gr.Row():
+        method_dropdown = gr.Dropdown(
+            ["Mean", "Nearest Neighbour", "ST-ITO", "Regression"],
+            value="ST-ITO",
+            label=f"Style Transfer Method",
+            interactive=True,
+        )
+        dataset_dropdown = gr.Dropdown(
+            ["Internal", "MedleyDB"],
+            label="Prior Distribution",
+            info="When using the Regression method, this parameter has no effect as the model is trained on the internal dataset.",
+            value="Internal",
+            interactive=True,
+        )
+        embedding_dropdown = gr.Dropdown(
+            ["AFx-Rep", "MFCC", "MIR Features"],
+            label="Embedding Model",
+            info="This parameter is used in the Nearest Neighbour and ST-ITO methods.",
+            value="AFx-Rep",
+            interactive=True,
+        )
+    # Options specific to the ST-ITO method.
+    _ = gr.Markdown("## Parameters for ST-ITO Method")
+    with gr.Row():
+        optimisation_steps = gr.Slider(
+            minimum=1,
+            maximum=10000,
+            value=1000,
+            label="Number of Optimisation Steps",
+            interactive=True,
+        )
+        prior_weight = gr.Slider(
+            minimum=0.0,
+            maximum=1.0,
+            value=0.1,
+            label="Prior Weight",
+            interactive=True,
+        )
+        optimiser_dropdown = gr.Dropdown(
+            [
+                "Adadelta",
+                "Adafactor",
+                "Adagrad",
+                "Adam",
+                "AdamW",
+                "Adamax",
+                "RMSprop",
+                "ASGD",
+                "NAdam",
+                "RAdam",
+                "SGD",
+            ],
+            value="Adam",
+            label="Optimiser",
+            interactive=True,
+        )
+# Start the app as soon as the module is executed.
+demo.launch()