Spaces:
Running
Running
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import yaml | |
| import json | |
| import pyloudnorm as pyln | |
| from hydra.utils import instantiate | |
| from random import normalvariate | |
| from soxr import resample | |
| from functools import partial | |
| from modules.utils import chain_functions, vec2statedict, get_chunks | |
| from modules.fx import clip_delay_eq_Q | |
| title_md = "# Vocal Effects Generator" | |
| description_md = """ | |
| This is a demo of the paper [DiffVox: A Differentiable Model for Capturing and Analysing Professional Effects Distributions](https://arxiv.org/abs/2504.14735), accepted at DAFx 2025. | |
| In this demo, you can upload a raw vocal audio file (in mono) and apply random effects to make it sound better! | |
| The effects consist of series of EQ, compressor, delay, and reverb. | |
| The generator is a PCA model derived from 365 vocal effects presets fitted with the same effects chain. | |
| This interface allows you to control the principal components (PCs) of the generator, randomise them, and render the audio. | |
| To give you some idea, we emperically found that the first PC controls the amount of reverb and the second PC controls the amount of brightness. | |
| Note that adding these PCs together does not necessarily mean that their effects are additive in the final audio. | |
| We found sometimes the effects of least important PCs are more perceptible. | |
| Try to play around with the sliders and buttons and see what you can come up with! | |
| Currently only PCs are tweakable, but in the future we will add more controls and visualisation tools. | |
| For example: | |
| - Directly controlling the parameters of the effects | |
| - Visualising the PCA space | |
| - Visualising the frequency responses/dynamic curves of the effects | |
| """ | |
| SLIDER_MAX = 3 | |
| SLIDER_MIN = -3 | |
| NUMBER_OF_PCS = 10 | |
| TEMPERATURE = 0.7 | |
| CONFIG_PATH = "presets/rt_config.yaml" | |
| PCA_PARAM_FILE = "presets/internal/gaussian.npz" | |
| INFO_PATH = "presets/internal/info.json" | |
| with open(CONFIG_PATH) as fp: | |
| fx_config = yaml.safe_load(fp)["model"] | |
| fx = instantiate(fx_config) | |
| fx.eval() | |
| pca_params = np.load(PCA_PARAM_FILE) | |
| mean = pca_params["mean"] | |
| cov = pca_params["cov"] | |
| eigvals, eigvecs = np.linalg.eigh(cov) | |
| eigvals = np.flip(eigvals, axis=0)[:75] | |
| eigvecs = np.flip(eigvecs, axis=1)[:, :75] | |
| U = eigvecs * np.sqrt(eigvals) | |
| U = torch.from_numpy(U).float() | |
| mean = torch.from_numpy(mean).float() | |
| z = torch.zeros(75) | |
| with open(INFO_PATH) as f: | |
| info = json.load(f) | |
| param_keys = info["params_keys"] | |
| original_shapes = list( | |
| map(lambda lst: lst if len(lst) else [1], info["params_original_shapes"]) | |
| ) | |
| *vec2dict_args, _ = get_chunks(param_keys, original_shapes) | |
| vec2dict_args = [param_keys, original_shapes] + vec2dict_args | |
| vec2dict = partial( | |
| vec2statedict, | |
| **dict( | |
| zip( | |
| [ | |
| "keys", | |
| "original_shapes", | |
| "selected_chunks", | |
| "position", | |
| "U_matrix_shape", | |
| ], | |
| vec2dict_args, | |
| ) | |
| ), | |
| ) | |
| meter = pyln.Meter(44100) | |
| def inference(audio): | |
| sr, y = audio | |
| if sr != 44100: | |
| y = resample(y, sr, 44100) | |
| if y.dtype.kind != "f": | |
| y = y / 32768.0 | |
| if y.ndim == 1: | |
| y = y[:, None] | |
| loudness = meter.integrated_loudness(y) | |
| y = pyln.normalize.loudness(y, loudness, -18.0) | |
| y = torch.from_numpy(y).float().T.unsqueeze(0) | |
| if y.shape[1] != 1: | |
| y = y.mean(dim=1, keepdim=True) | |
| # M = eigvals.shape[0] | |
| # z = torch.cat( | |
| # [ | |
| # torch.tensor([float(x) for x in pcs]), | |
| # ( | |
| # torch.randn(M - len(pcs)) * TEMPERATURE | |
| # if randomise_rest | |
| # else torch.zeros(M - len(pcs)) | |
| # ), | |
| # ] | |
| # ) | |
| x = U @ z + mean | |
| # print(z) | |
| fx.load_state_dict(vec2dict(x), strict=False) | |
| fx.apply(partial(clip_delay_eq_Q, Q=0.707)) | |
| rendered = fx(y).squeeze(0).T.numpy() | |
| if np.max(np.abs(rendered)) > 1: | |
| rendered = rendered / np.max(np.abs(rendered)) | |
| return (44100, (rendered * 32768).astype(np.int16)) | |
| def get_important_pcs(n=10, **kwargs): | |
| sliders = [ | |
| gr.Slider(minimum=SLIDER_MIN, maximum=SLIDER_MAX, label=f"PC {i}", **kwargs) | |
| for i in range(1, n + 1) | |
| ] | |
| return sliders | |
| def model2json(): | |
| fx_names = ["PK1", "PK2", "LS", "HS", "LP", "HP", "DRC"] | |
| results = {k: v.toJSON() for k, v in zip(fx_names, fx)} | { | |
| "Panner": fx[7].pan.toJSON() | |
| } | |
| spatial_fx = { | |
| "DLY": fx[7].effects[0].toJSON() | {"LP": fx[7].effects[0].eq.toJSON()}, | |
| "FDN": fx[7].effects[1].toJSON() | |
| | { | |
| "Tone correction PEQ": { | |
| k: v.toJSON() for k, v in zip(fx_names[:4], fx[7].effects[1].eq) | |
| } | |
| }, | |
| "Cross Send (dB)": fx[7].params.sends_0.log10().mul(20).item(), | |
| } | |
| return json.dumps( | |
| { | |
| "Direct": results, | |
| "Sends": spatial_fx, | |
| } | |
| ) | |
| with gr.Blocks() as demo: | |
| gr.Markdown( | |
| title_md, | |
| elem_id="title", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown( | |
| description_md, | |
| elem_id="description", | |
| ) | |
| gr.Image("diffvox_diagram.png", elem_id="diagram") | |
| with gr.Row(): | |
| with gr.Column(): | |
| audio_input = gr.Audio( | |
| type="numpy", sources="upload", label="Input Audio", loop=True | |
| ) | |
| with gr.Row(): | |
| random_button = gr.Button( | |
| f"Randomise PCs", | |
| elem_id="randomise-button", | |
| ) | |
| reset_button = gr.Button( | |
| "Reset", | |
| elem_id="reset-button", | |
| ) | |
| render_button = gr.Button( | |
| "Run", elem_id="render-button", variant="primary" | |
| ) | |
| # random_rest_checkbox = gr.Checkbox( | |
| # label=f"Randomise PCs > {NUMBER_OF_PCS} (default to zeros)", | |
| # value=False, | |
| # elem_id="randomise-checkbox", | |
| # ) | |
| sliders = get_important_pcs(NUMBER_OF_PCS, value=0) | |
| extra_pc_dropdown = gr.Dropdown( | |
| list(range(NUMBER_OF_PCS + 1, 76)), | |
| label=f"PC > {NUMBER_OF_PCS}", | |
| info="Select which extra PC to adjust", | |
| interactive=True, | |
| ) | |
| extra_slider = gr.Slider( | |
| minimum=SLIDER_MIN, | |
| maximum=SLIDER_MAX, | |
| label="Extra PC", | |
| value=0, | |
| ) | |
| with gr.Column(): | |
| audio_output = gr.Audio( | |
| type="numpy", label="Output Audio", interactive=False, loop=True | |
| ) | |
| json_output = gr.JSON(label="Effect Settings", max_height=800, open=True) | |
| render_button.click( | |
| lambda *args: (lambda x: (x, model2json()))(inference(*args)), | |
| inputs=[ | |
| audio_input, | |
| # random_rest_checkbox, | |
| ] | |
| # + sliders, | |
| , | |
| outputs=[audio_output, json_output], | |
| ) | |
| random_button.click( | |
| # lambda *xs: [ | |
| # chain_functions( | |
| # partial(max, SLIDER_MIN), | |
| # partial(min, SLIDER_MAX), | |
| # )(normalvariate(0, 1)) | |
| # for _ in range(len(xs)) | |
| # ], | |
| lambda i: (lambda x: x[:NUMBER_OF_PCS].tolist() + [x[i - 1].item()])( | |
| z.normal_(0, 1).clip_(SLIDER_MIN, SLIDER_MAX) | |
| ), | |
| inputs=extra_pc_dropdown, | |
| outputs=sliders + [extra_slider], | |
| ) | |
| reset_button.click( | |
| lambda *xs: (lambda _: [0 for _ in range(len(xs))])(z.zero_()), | |
| inputs=sliders + [extra_slider], | |
| outputs=sliders + [extra_slider], | |
| ) | |
| def update_z(s, i): | |
| z[i] = s | |
| return | |
| for i, slider in enumerate(sliders): | |
| slider.change(partial(update_z, i=i), inputs=slider) | |
| extra_slider.change( | |
| lambda _, i: update_z(_, i - 1), inputs=[extra_slider, extra_pc_dropdown] | |
| ) | |
| extra_pc_dropdown.change( | |
| lambda i: z[i - 1].item(), | |
| inputs=extra_pc_dropdown, | |
| outputs=extra_slider, | |
| ) | |
| demo.launch() | |