Spaces:

yoyolicoris
/

diffvox-ito

Running on Zero

App Files Files Community

yoyolicoris committed on Jul 8, 2025

Commit

40355de

1 Parent(s): a25cbf8

update app.py

Browse files

Files changed (1) hide show

app.py +77 -30

app.py CHANGED Viewed

@@ -15,19 +15,19 @@ from itertools import accumulate
 from torchcomp import coef2ms, ms2coef
 from copy import deepcopy
 from pathlib import Path
-from typing import Tuple, List, Optional, Union
 from modules.utils import vec2statedict, get_chunks
-from modules.fx import clip_delay_eq_Q
-from plot_utils import get_log_mags_from_eq
-def chain_functions(*functions):
-    return lambda *initial_args: reduce(
-        lambda xs, f: f(*xs) if isinstance(xs, tuple) else f(xs),
-        functions,
-        initial_args,
-    )
 title_md = "# Vocal Effects Style Transfer Demo"
@@ -135,34 +135,65 @@ global_fx = instantiate(fx_config)
 # global_fx.eval()
 global_fx.load_state_dict(vec2dict(internal_mean), strict=False)
 meter = pyln.Meter(44100)
 def inference(
-    audio,
     ratio,
     method,
     dataset,
     embedding,
     remove_approx,
     steps,
     prior_weight,
     optimiser,
     lr,
 ):
-    sr, y = audio
-    if sr != 44100:
-        y = resample(y, sr, 44100)
-    if y.dtype.kind != "f":
-        y = y / 32768.0
-    if y.ndim == 1:
-        y = y[:, None]
     loudness = meter.integrated_loudness(y)
     y = pyln.normalize.loudness(y, loudness, -18.0)
     y = torch.from_numpy(y).float().T.unsqueeze(0)
     if y.shape[1] != 1:
         y = y.mean(dim=1, keepdim=True)
@@ -172,8 +203,16 @@ def inference(
     match method:
         case "Mean":
             vec = gaussian_params_dict[dataset][0]
         case _:
-            vec = internal_mean.clone()
     if remove_approx:
         infer_fx = instantiate(rt_config)
@@ -407,8 +446,8 @@ with gr.Blocks() as demo:
             wet_output = default_audio_block(label="Wet Audio", interactive=False)
     with gr.Row():
-        reset_button = gr.Button("Reset", elem_id="reset-button")
         render_button = gr.Button("Run", elem_id="render-button", variant="primary")
     _ = gr.Markdown("## Common Parameters")
     with gr.Row():
@@ -426,18 +465,24 @@ with gr.Blocks() as demo:
             interactive=True,
         )
         embedding_dropdown = gr.Dropdown(
-            ["AFx-Rep", "MFCC", "MIR Features"],
             label="Embedding Model",
             info="This parameter is used in the Nearest Neighbour and ST-ITO methods.",
-            value="AFx-Rep",
-            interactive=True,
-        )
-        remove_approx_checkbox = gr.Checkbox(
-            label="Use Real-time Effects",
-            info="Use real-time delay and reverb effects instead of approximated ones.",
-            value=False,
             interactive=True,
         )
     _ = gr.Markdown("## Parameters for ST-ITO Method")
     with gr.Row():
@@ -492,11 +537,13 @@ with gr.Blocks() as demo:
         ),
         inputs=[
             audio_input,
             dry_wet_ratio,
             method_dropdown,
             dataset_dropdown,
             embedding_dropdown,
             remove_approx_checkbox,
             optimisation_steps,
             prior_weight,
             optimiser_dropdown,

 from torchcomp import coef2ms, ms2coef
 from copy import deepcopy
 from pathlib import Path
+from typing import Tuple, List, Optional, Union, Callable
 from modules.utils import vec2statedict, get_chunks
+from modules.fx import clip_delay_eq_Q, hadamard
+from utils import get_log_mags_from_eq, chain_functions
+from ito import find_closest_training_sample
+from st_ito.utils import (
+    load_param_model,
+    get_param_embeds,
+    get_feature_embeds,
+    load_mfcc_feature_extractor,
+    load_mir_feature_extractor,
+)
 title_md = "# Vocal Effects Style Transfer Demo"
 # global_fx.eval()
 global_fx.load_state_dict(vec2dict(internal_mean), strict=False)
+ndim_dict = {k: v.ndim for k, v in global_fx.state_dict().items()}
+to_fx_state_dict = lambda x: {
+    k: v[0] if ndim_dict[k] == 0 else v for k, v in vec2dict(x).items()
+}
 meter = pyln.Meter(44100)
+def get_embedding_model(embedding: str) -> Callable:
+    match embedding:
+        case "afx-rep":
+            afx_rep = load_param_model()
+            two_chs_emb_fn = lambda x: get_param_embeds(x, afx_rep, 44100)
+        case "mfcc":
+            mfcc = load_mfcc_feature_extractor()
+            two_chs_emb_fn = lambda x: get_feature_embeds(x, mfcc)
+        case "mir":
+            mir = load_mir_feature_extractor()
+            two_chs_emb_fn = lambda x: get_feature_embeds(x, mir)
+        case _:
+            raise ValueError(f"Unknown encoder: {embedding}")
+    return two_chs_emb_fn
+def convert2float(sr: int, x: np.ndarray) -> np.ndarray:
+    if sr != 44100:
+        x = resample(x, sr, 44100)
+    if x.dtype.kind != "f":
+        x = x / 32768.0
+    if x.ndim == 1:
+        x = x[:, None]
+    return x
 def inference(
+    input_audio,
+    ref_audio,
     ratio,
     method,
     dataset,
     embedding,
     remove_approx,
+    mid_side,
     steps,
     prior_weight,
     optimiser,
     lr,
 ):
+    y = convert2float(*input_audio)
+    ref = convert2float(*ref_audio)
     loudness = meter.integrated_loudness(y)
     y = pyln.normalize.loudness(y, loudness, -18.0)
     y = torch.from_numpy(y).float().T.unsqueeze(0)
+    ref_loudness = meter.integrated_loudness(ref)
+    ref = pyln.normalize.loudness(ref, ref_loudness, -18.0)
+    ref = torch.from_numpy(ref).float().T.unsqueeze(0)
     if y.shape[1] != 1:
         y = y.mean(dim=1, keepdim=True)
     match method:
         case "Mean":
             vec = gaussian_params_dict[dataset][0]
+        case "Nearest Neighbour":
+            two_chs_emb_fn = chain_functions(
+                hadamard if mid_side else lambda x: x,
+                get_embedding_model(embedding),
+            )
+            vec = find_closest_training_sample(
+                fx, two_chs_emb_fn, to_fx_state_dict, preset_dict[dataset], ref, y
+            )
         case _:
+            raise ValueError(f"Unknown method: {method}")
     if remove_approx:
         infer_fx = instantiate(rt_config)
             wet_output = default_audio_block(label="Wet Audio", interactive=False)
     with gr.Row():
         render_button = gr.Button("Run", elem_id="render-button", variant="primary")
+        reset_button = gr.Button("Reset", elem_id="reset-button")
     _ = gr.Markdown("## Common Parameters")
     with gr.Row():
             interactive=True,
         )
         embedding_dropdown = gr.Dropdown(
+            [("AFx-Rep", "afx-rep"), ("MFCC", "mfcc"), ("MIR Features", "mir")],
             label="Embedding Model",
             info="This parameter is used in the Nearest Neighbour and ST-ITO methods.",
+            value="afx-rep",
             interactive=True,
         )
+        with gr.Column():
+            remove_approx_checkbox = gr.Checkbox(
+                label="Use Real-time Effects",
+                info="Use real-time delay and reverb effects instead of approximated ones.",
+                value=False,
+                interactive=True,
+            )
+            mid_side_checkbox = gr.Checkbox(
+                label="Use Mid-Side Processing",
+                value=True,
+                interactive=True,
+            )
     _ = gr.Markdown("## Parameters for ST-ITO Method")
     with gr.Row():
         ),
         inputs=[
             audio_input,
+            audio_reference,
             dry_wet_ratio,
             method_dropdown,
             dataset_dropdown,
             embedding_dropdown,
             remove_approx_checkbox,
+            mid_side_checkbox,
             optimisation_steps,
             prior_weight,
             optimiser_dropdown,