Spaces:
Running on Zero
Running on Zero
Commit ·
48e1ce4
1
Parent(s): 0c4dc06
feat: add regression model and update inference methods
Browse files- app.py +73 -22
- ito.py +4 -20
- modules/model.py +33 -0
app.py
CHANGED
|
@@ -19,7 +19,12 @@ from typing import Tuple, List, Optional, Union, Callable
|
|
| 19 |
|
| 20 |
from modules.utils import vec2statedict, get_chunks
|
| 21 |
from modules.fx import clip_delay_eq_Q, hadamard
|
| 22 |
-
from utils import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
from ito import find_closest_training_sample, one_evaluation
|
| 24 |
from st_ito.utils import (
|
| 25 |
load_param_model,
|
|
@@ -62,6 +67,7 @@ PRESET_PATH = {
|
|
| 62 |
"internal": Path("presets/internal/"),
|
| 63 |
"medleydb": Path("presets/medleydb/"),
|
| 64 |
}
|
|
|
|
| 65 |
|
| 66 |
PCA_PARAM_FILE = "gaussian.npz"
|
| 67 |
INFO_PATH = "info.json"
|
|
@@ -100,6 +106,7 @@ def logp_x(mu, cov, cov_logdet, x):
|
|
| 100 |
diff = x - mu
|
| 101 |
b = torch.linalg.solve(cov, diff)
|
| 102 |
norm = diff @ b
|
|
|
|
| 103 |
return -0.5 * (norm + cov_logdet + mu.shape[0] * math.log(2 * math.pi))
|
| 104 |
|
| 105 |
|
|
@@ -168,6 +175,32 @@ def get_embedding_model(embedding: str) -> Callable:
|
|
| 168 |
return two_chs_emb_fn
|
| 169 |
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
def convert2float(sr: int, x: np.ndarray) -> np.ndarray:
|
| 172 |
if sr != 44100:
|
| 173 |
x = resample(x, sr, 44100)
|
|
@@ -200,6 +233,10 @@ def inference(
|
|
| 200 |
ref = pyln.normalize.loudness(ref, ref_loudness, -18.0)
|
| 201 |
ref = torch.from_numpy(ref).float().T.unsqueeze(0).to(device)
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
y = convert2float(*input_audio)
|
| 204 |
loudness = meter.integrated_loudness(y)
|
| 205 |
y = pyln.normalize.loudness(y, loudness, -18.0)
|
|
@@ -219,7 +256,13 @@ def inference(
|
|
| 219 |
match method:
|
| 220 |
case "Nearest Neighbour":
|
| 221 |
vec = find_closest_training_sample(
|
| 222 |
-
fx,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
)
|
| 224 |
|
| 225 |
case "ST-ITO":
|
|
@@ -228,7 +271,7 @@ def inference(
|
|
| 228 |
two_chs_emb_fn,
|
| 229 |
to_fx_state_dict,
|
| 230 |
partial(logp_x, *[x.to(device) for x in gaussian_params_dict[dataset]]),
|
| 231 |
-
|
| 232 |
ref,
|
| 233 |
y,
|
| 234 |
optimiser_type=optimiser,
|
|
@@ -438,13 +481,7 @@ with gr.Blocks() as demo:
|
|
| 438 |
# fx = vec2fx(fx_params.value)
|
| 439 |
# sr, y = read(EXAMPLE_PATH)
|
| 440 |
|
| 441 |
-
default_pc_slider = partial(
|
| 442 |
-
gr.Slider, minimum=SLIDER_MIN, maximum=SLIDER_MAX, interactive=True, value=0
|
| 443 |
-
)
|
| 444 |
default_audio_block = partial(gr.Audio, type="numpy", loop=True)
|
| 445 |
-
default_freq_slider = partial(gr.Slider, label="Frequency (Hz)", interactive=True)
|
| 446 |
-
default_gain_slider = partial(gr.Slider, label="Gain (dB)", interactive=True)
|
| 447 |
-
default_q_slider = partial(gr.Slider, label="Q", interactive=True)
|
| 448 |
|
| 449 |
gr.Markdown(
|
| 450 |
title_md,
|
|
@@ -468,6 +505,15 @@ with gr.Blocks() as demo:
|
|
| 468 |
sources="upload",
|
| 469 |
label="Reference Audio",
|
| 470 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
|
| 472 |
with gr.Column():
|
| 473 |
audio_output = default_audio_block(label="Output Audio", interactive=False)
|
|
@@ -481,18 +527,8 @@ with gr.Blocks() as demo:
|
|
| 481 |
direct_output = default_audio_block(label="Direct Audio", interactive=False)
|
| 482 |
wet_output = default_audio_block(label="Wet Audio", interactive=False)
|
| 483 |
|
|
|
|
| 484 |
with gr.Row():
|
| 485 |
-
process_button = gr.Button("Run", elem_id="render-button", variant="primary")
|
| 486 |
-
reset_button = gr.Button("Reset", elem_id="reset-button")
|
| 487 |
-
|
| 488 |
-
_ = gr.Markdown("## Common Parameters")
|
| 489 |
-
with gr.Row():
|
| 490 |
-
method_dropdown = gr.Dropdown(
|
| 491 |
-
["Mean", "Nearest Neighbour", "ST-ITO", "Regression"],
|
| 492 |
-
value="ST-ITO",
|
| 493 |
-
label=f"Style Transfer Method",
|
| 494 |
-
interactive=True,
|
| 495 |
-
)
|
| 496 |
dataset_dropdown = gr.Dropdown(
|
| 497 |
[("Internal", "internal"), ("MedleyDB", "medleydb")],
|
| 498 |
label="Prior Distribution",
|
|
@@ -503,7 +539,7 @@ with gr.Blocks() as demo:
|
|
| 503 |
embedding_dropdown = gr.Dropdown(
|
| 504 |
[("AFx-Rep", "afx-rep"), ("MFCC", "mfcc"), ("MIR Features", "mir")],
|
| 505 |
label="Embedding Model",
|
| 506 |
-
info="This parameter
|
| 507 |
value="afx-rep",
|
| 508 |
interactive=True,
|
| 509 |
)
|
|
@@ -516,6 +552,7 @@ with gr.Blocks() as demo:
|
|
| 516 |
)
|
| 517 |
mid_side_checkbox = gr.Checkbox(
|
| 518 |
label="Use Mid-Side Processing",
|
|
|
|
| 519 |
value=True,
|
| 520 |
interactive=True,
|
| 521 |
)
|
|
@@ -524,7 +561,7 @@ with gr.Blocks() as demo:
|
|
| 524 |
with gr.Row():
|
| 525 |
optimisation_steps = gr.Slider(
|
| 526 |
minimum=1,
|
| 527 |
-
maximum=
|
| 528 |
value=100,
|
| 529 |
step=1,
|
| 530 |
label="Number of Optimisation Steps",
|
|
@@ -620,5 +657,19 @@ with gr.Blocks() as demo:
|
|
| 620 |
],
|
| 621 |
)
|
| 622 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
|
| 624 |
demo.launch()
|
|
|
|
| 19 |
|
| 20 |
from modules.utils import vec2statedict, get_chunks
|
| 21 |
from modules.fx import clip_delay_eq_Q, hadamard
|
| 22 |
+
from utils import (
|
| 23 |
+
get_log_mags_from_eq,
|
| 24 |
+
chain_functions,
|
| 25 |
+
remove_window_fn,
|
| 26 |
+
jsonparse2hydra,
|
| 27 |
+
)
|
| 28 |
from ito import find_closest_training_sample, one_evaluation
|
| 29 |
from st_ito.utils import (
|
| 30 |
load_param_model,
|
|
|
|
| 67 |
"internal": Path("presets/internal/"),
|
| 68 |
"medleydb": Path("presets/medleydb/"),
|
| 69 |
}
|
| 70 |
+
CKPT_PATH = Path("reg-ckpts/")
|
| 71 |
|
| 72 |
PCA_PARAM_FILE = "gaussian.npz"
|
| 73 |
INFO_PATH = "info.json"
|
|
|
|
| 106 |
diff = x - mu
|
| 107 |
b = torch.linalg.solve(cov, diff)
|
| 108 |
norm = diff @ b
|
| 109 |
+
assert torch.all(norm >= 0), "Negative norm detected, check covariance matrix."
|
| 110 |
return -0.5 * (norm + cov_logdet + mu.shape[0] * math.log(2 * math.pi))
|
| 111 |
|
| 112 |
|
|
|
|
| 175 |
return two_chs_emb_fn
|
| 176 |
|
| 177 |
|
| 178 |
+
def get_regressor() -> Callable:
    """Load the trained regression model and return an inference closure.

    Reads the model config from ``CKPT_PATH/config.yaml``, restores the
    checkpoint with the lowest validation loss, and returns a function that
    maps a wet (processed) audio tensor to de-normalised effect parameters.

    Returns:
        Callable: ``regressor(wet)`` producing predicted parameters on the
        device named in ``DEVICE.txt``.
    """
    with open(CKPT_PATH / "config.yaml") as f:
        config = yaml.safe_load(f)

    model_config = config["model"]

    # Pick the checkpoint with the lowest validation loss; assumes the file
    # stem ends with "...val_loss=<value>" — TODO confirm naming convention.
    checkpoints = (CKPT_PATH / "checkpoints").glob("*val_loss*.ckpt")
    best_checkpoint = min(checkpoints, key=lambda p: float(p.stem.split("=")[-1]))
    ckpt = torch.load(best_checkpoint, map_location="cpu")

    model = chain_functions(remove_window_fn, jsonparse2hydra, instantiate)(
        model_config
    )
    model.load_state_dict(ckpt["state_dict"])

    # .strip() guards against a trailing newline in DEVICE.txt, which torch
    # would reject as an invalid device string.
    device = Path("DEVICE.txt").read_text().strip()
    model = model.to(device)
    model.eval()

    # Load on CPU first so a GPU-saved stats file still works, then move over.
    param_stats = torch.load(CKPT_PATH / "param_stats.pt", map_location="cpu")
    param_mu = param_stats["mu"].float().to(device)
    param_std = param_stats["std"].float().to(device)

    def regressor(wet):
        # The model predicts z-score-normalised parameters; undo the
        # normalisation. no_grad: pure inference, no autograd bookkeeping.
        with torch.no_grad():
            return model(wet, dry=None) * param_std + param_mu

    return regressor
|
| 202 |
+
|
| 203 |
+
|
| 204 |
def convert2float(sr: int, x: np.ndarray) -> np.ndarray:
|
| 205 |
if sr != 44100:
|
| 206 |
x = resample(x, sr, 44100)
|
|
|
|
| 233 |
ref = pyln.normalize.loudness(ref, ref_loudness, -18.0)
|
| 234 |
ref = torch.from_numpy(ref).float().T.unsqueeze(0).to(device)
|
| 235 |
|
| 236 |
+
if method == "Regression":
|
| 237 |
+
regressor = get_regressor()
|
| 238 |
+
return regressor(ref).mean(0)
|
| 239 |
+
|
| 240 |
y = convert2float(*input_audio)
|
| 241 |
loudness = meter.integrated_loudness(y)
|
| 242 |
y = pyln.normalize.loudness(y, loudness, -18.0)
|
|
|
|
| 256 |
match method:
|
| 257 |
case "Nearest Neighbour":
|
| 258 |
vec = find_closest_training_sample(
|
| 259 |
+
fx,
|
| 260 |
+
two_chs_emb_fn,
|
| 261 |
+
to_fx_state_dict,
|
| 262 |
+
preset_dict[dataset].to(device),
|
| 263 |
+
ref,
|
| 264 |
+
y,
|
| 265 |
+
progress,
|
| 266 |
)
|
| 267 |
|
| 268 |
case "ST-ITO":
|
|
|
|
| 271 |
two_chs_emb_fn,
|
| 272 |
to_fx_state_dict,
|
| 273 |
partial(logp_x, *[x.to(device) for x in gaussian_params_dict[dataset]]),
|
| 274 |
+
gaussian_params_dict[dataset][0].to(device),
|
| 275 |
ref,
|
| 276 |
y,
|
| 277 |
optimiser_type=optimiser,
|
|
|
|
| 481 |
# fx = vec2fx(fx_params.value)
|
| 482 |
# sr, y = read(EXAMPLE_PATH)
|
| 483 |
|
|
|
|
|
|
|
|
|
|
| 484 |
default_audio_block = partial(gr.Audio, type="numpy", loop=True)
|
|
|
|
|
|
|
|
|
|
| 485 |
|
| 486 |
gr.Markdown(
|
| 487 |
title_md,
|
|
|
|
| 505 |
sources="upload",
|
| 506 |
label="Reference Audio",
|
| 507 |
)
|
| 508 |
+
method_dropdown = gr.Dropdown(
|
| 509 |
+
["Mean", "Nearest Neighbour", "ST-ITO", "Regression"],
|
| 510 |
+
value="ST-ITO",
|
| 511 |
+
label=f"Style Transfer Method",
|
| 512 |
+
interactive=True,
|
| 513 |
+
)
|
| 514 |
+
process_button = gr.Button(
|
| 515 |
+
"Run", elem_id="render-button", variant="primary"
|
| 516 |
+
)
|
| 517 |
|
| 518 |
with gr.Column():
|
| 519 |
audio_output = default_audio_block(label="Output Audio", interactive=False)
|
|
|
|
| 527 |
direct_output = default_audio_block(label="Direct Audio", interactive=False)
|
| 528 |
wet_output = default_audio_block(label="Wet Audio", interactive=False)
|
| 529 |
|
| 530 |
+
_ = gr.Markdown("## Control Parameters")
|
| 531 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
dataset_dropdown = gr.Dropdown(
|
| 533 |
[("Internal", "internal"), ("MedleyDB", "medleydb")],
|
| 534 |
label="Prior Distribution",
|
|
|
|
| 539 |
embedding_dropdown = gr.Dropdown(
|
| 540 |
[("AFx-Rep", "afx-rep"), ("MFCC", "mfcc"), ("MIR Features", "mir")],
|
| 541 |
label="Embedding Model",
|
| 542 |
+
info="This parameter has no effect when using the Mean and Regression methods.",
|
| 543 |
value="afx-rep",
|
| 544 |
interactive=True,
|
| 545 |
)
|
|
|
|
| 552 |
)
|
| 553 |
mid_side_checkbox = gr.Checkbox(
|
| 554 |
label="Use Mid-Side Processing",
|
| 555 |
+
info="This option has no effect when using the Mean and Regression methods.",
|
| 556 |
value=True,
|
| 557 |
interactive=True,
|
| 558 |
)
|
|
|
|
| 561 |
with gr.Row():
|
| 562 |
optimisation_steps = gr.Slider(
|
| 563 |
minimum=1,
|
| 564 |
+
maximum=2000,
|
| 565 |
value=100,
|
| 566 |
step=1,
|
| 567 |
label="Number of Optimisation Steps",
|
|
|
|
| 657 |
],
|
| 658 |
)
|
| 659 |
|
| 660 |
+
dry_wet_ratio.input(
    # Equal-power crossfade between the direct (dry) and processed (wet)
    # outputs whenever the dry/wet slider moves.
    chain_functions(
        # Gradio delivers (sr, int16 samples); convert samples to float in [-1, 1).
        lambda ratio, *audios: (ratio, *map(lambda a: a[1] / 32768, audios)),
        # cos/sin weights keep perceived loudness roughly constant across the
        # slider range; sqrt(2) is make-up gain at the midpoint.
        lambda ratio, d, w: math.sqrt(2)
        * (
            math.cos(ratio * math.pi * 0.5) * d
            + math.sin(ratio * math.pi * 0.5) * w
        ),
        # Clip before the int16 cast: the sqrt(2) make-up gain can push the mix
        # past full scale, and a bare cast would wrap around (harsh distortion)
        # instead of saturating.
        lambda x: (44100, (np.clip(x, -1.0, 32767 / 32768) * 32768).astype(np.int16)),
    ),
    inputs=[dry_wet_ratio, direct_output, wet_output],
    outputs=[audio_output],
)
|
| 673 |
+
|
| 674 |
|
| 675 |
demo.launch()
|
ito.py
CHANGED
|
@@ -26,25 +26,6 @@ from st_ito.utils import (
|
|
| 26 |
from utils import remove_window_fn, jsonparse2hydra
|
| 27 |
|
| 28 |
|
| 29 |
-
def get_reference_query_chunks(dry_audio, wet_audio, chunk_size, sr):
|
| 30 |
-
dry = dry_audio.unfold(1, chunk_size, chunk_size).transpose(0, 1)
|
| 31 |
-
wet = wet_audio.unfold(1, chunk_size, chunk_size).transpose(0, 1)
|
| 32 |
-
|
| 33 |
-
max_filtered = F.max_pool1d(wet.mean(1).abs(), int(sr * 0.05), stride=1)
|
| 34 |
-
active_mask = torch.quantile(max_filtered, 0.5, dim=1) > 0.001 # -60 dB
|
| 35 |
-
if not active_mask.any():
|
| 36 |
-
raise ValueError("No active frames")
|
| 37 |
-
elif active_mask.count_nonzero() < 2:
|
| 38 |
-
raise ValueError("Too few active frames")
|
| 39 |
-
|
| 40 |
-
dry = dry[active_mask]
|
| 41 |
-
wet = wet[active_mask]
|
| 42 |
-
|
| 43 |
-
ref_audio = wet[::2].contiguous()
|
| 44 |
-
raw_audio = dry[1::2].contiguous()
|
| 45 |
-
return ref_audio, raw_audio
|
| 46 |
-
|
| 47 |
-
|
| 48 |
def logp_y_given_x(y, mu, std):
|
| 49 |
cos_dist = torch.arccos(y @ mu)
|
| 50 |
return -0.5 * (cos_dist / std).pow(2) - 0.5 * math.log(2 * math.pi) - std.log()
|
|
@@ -130,6 +111,7 @@ def find_closest_training_sample(
|
|
| 130 |
training_samples: torch.Tensor,
|
| 131 |
ref_audio: torch.Tensor,
|
| 132 |
raw_audio: torch.Tensor,
|
|
|
|
| 133 |
) -> torch.Tensor:
|
| 134 |
|
| 135 |
peak_scaler = 1 / ref_audio.abs().max()
|
|
@@ -167,7 +149,9 @@ def find_closest_training_sample(
|
|
| 167 |
)
|
| 168 |
|
| 169 |
best_logp, best_param = reduce(
|
| 170 |
-
reduce_closure,
|
|
|
|
|
|
|
| 171 |
)
|
| 172 |
print(f"Best log-likelihood: {best_logp}")
|
| 173 |
return best_param
|
|
|
|
| 26 |
from utils import remove_window_fn, jsonparse2hydra
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def logp_y_given_x(y, mu, std):
    """Gaussian log-likelihood of the arc distance between embeddings.

    Treats the angle ``arccos(y @ mu)`` between the (assumed unit-norm —
    TODO confirm at call sites) vectors as zero-mean Gaussian with standard
    deviation ``std``.

    Args:
        y: embedding tensor.
        mu: reference embedding tensor.
        std: scalar tensor, standard deviation of the angular distance.

    Returns:
        Tensor of log-likelihood values.
    """
    # Clamp guards against |y @ mu| drifting slightly above 1 from
    # floating-point error, which would make arccos return NaN.
    cos_sim = (y @ mu).clamp(-1.0, 1.0)
    cos_dist = torch.arccos(cos_sim)
    return -0.5 * (cos_dist / std).pow(2) - 0.5 * math.log(2 * math.pi) - std.log()
|
|
|
|
| 111 |
training_samples: torch.Tensor,
|
| 112 |
ref_audio: torch.Tensor,
|
| 113 |
raw_audio: torch.Tensor,
|
| 114 |
+
progress,
|
| 115 |
) -> torch.Tensor:
|
| 116 |
|
| 117 |
peak_scaler = 1 / ref_audio.abs().max()
|
|
|
|
| 149 |
)
|
| 150 |
|
| 151 |
best_logp, best_param = reduce(
|
| 152 |
+
reduce_closure,
|
| 153 |
+
progress.tqdm(training_samples.unbind(0)),
|
| 154 |
+
(-float("inf"), torch.tensor([])),
|
| 155 |
)
|
| 156 |
print(f"Best log-likelihood: {best_logp}")
|
| 157 |
return best_param
|
modules/model.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch import nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from functools import partial, reduce
|
| 5 |
+
from typing import Optional, List
|
| 6 |
+
from torchaudio.transforms import MelSpectrogram, MFCC
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class LogMelSpectrogram(MelSpectrogram):
    """Mel spectrogram on a log scale.

    Identical to ``torchaudio.transforms.MelSpectrogram`` except the output is
    passed through ``log``; a small epsilon avoids ``log(0)`` on silent frames.
    """

    def forward(self, waveform):
        mel = super().forward(waveform)
        return torch.log(mel + 1e-8)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class LogMFCC(MFCC):
    """MFCC transform that always operates on log-mel spectrograms.

    Thin wrapper around ``torchaudio.transforms.MFCC`` hard-wiring
    ``log_mels=True``; every other argument is forwarded unchanged.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, log_mels=True, **kwargs)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class LightningSequential(nn.Sequential):
    """``nn.Sequential`` variant whose layers may pass along multiple values.

    Takes the layer list as a single argument. During ``forward``, whenever a
    layer returns a tuple it is unpacked into the next layer's positional
    arguments; a single value is passed through as-is.
    """

    def __init__(self, modules: List[nn.Module]):
        super().__init__(*modules)

    def forward(self, *args):
        # Start from the full argument tuple so the first layer receives all
        # positional inputs.
        out = args
        for layer in self:
            out = layer(*out) if isinstance(out, tuple) else layer(out)
        return out
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class ResidualWrapper(nn.Module):
    """Skip connection around an arbitrary module: ``forward(x) = x + m(x)``."""

    def __init__(self, m: nn.Module):
        super().__init__()
        # Wrapped module; registered as a submodule so its parameters train.
        self.m = m

    def forward(self, x):
        residual = self.m(x)
        return x + residual
|