Spaces:

yoyolicoris
/

diffvox-ito

Running on Zero

App Files Community

yoyolicoris committed on Jul 9, 2025

Commit

6bd65db

1 Parent(s): 6fb8ca1

update app.py

Browse files

Files changed (1) hide show

app.py +62 -50

app.py CHANGED Viewed

@@ -47,7 +47,7 @@ Try to play around with the sliders and buttons and see what you can come up wit
 > **_Note:_** To upload your own audio, click X on the top right corner of the input audio block.
 """
-DEVICE = "cuda"
 SLIDER_MAX = 3
 SLIDER_MIN = -3
 NUMBER_OF_PCS = 4
@@ -180,77 +180,82 @@ def convert2float(sr: int, x: np.ndarray) -> np.ndarray:
 def inference(
     input_audio,
     ref_audio,
-    ratio,
     method,
     dataset,
     embedding,
-    remove_approx,
     mid_side,
     steps,
     prior_weight,
     optimiser,
     lr,
 ):
-    y = convert2float(*input_audio)
     ref = convert2float(*ref_audio)
     loudness = meter.integrated_loudness(y)
     y = pyln.normalize.loudness(y, loudness, -18.0)
     y = torch.from_numpy(y).float().T.unsqueeze(0).to(DEVICE)
-    ref_loudness = meter.integrated_loudness(ref)
-    ref = pyln.normalize.loudness(ref, ref_loudness, -18.0)
-    ref = torch.from_numpy(ref).float().T.unsqueeze(0).to(DEVICE)
     if y.shape[1] != 1:
         y = y.mean(dim=1, keepdim=True)
     fx = deepcopy(global_fx).to(DEVICE)
     fx.train()
     match method:
-        case "Mean":
-            vec = gaussian_params_dict[dataset][0]
-        case "Nearest Neighbour" | "ST-ITO":
-            two_chs_emb_fn = chain_functions(
-                hadamard if mid_side else lambda x: x,
-                get_embedding_model(embedding),
             )
-            vec = (
-                find_closest_training_sample(
-                    fx, two_chs_emb_fn, to_fx_state_dict, preset_dict[dataset], ref, y
-                )
-                if method == "Nearest Neighbour"
-                else one_evaluation(
-                    fx,
-                    two_chs_emb_fn,
-                    to_fx_state_dict,
-                    partial(
-                        logp_x, *[x.to(DEVICE) for x in gaussian_params_dict[dataset]]
-                    ),
-                    internal_mean.to(DEVICE),
-                    ref,
-                    y,
-                    optimiser_type=optimiser,
-                    lr=lr,
-                    steps=steps,
-                    weight=prior_weight,
-                )
             )
         case _:
             raise ValueError(f"Unknown method: {method}")
     if remove_approx:
         infer_fx = instantiate(rt_config).to(DEVICE)
     else:
-        infer_fx = fx
     infer_fx.load_state_dict(vec2dict(vec), strict=False)
     # fx.apply(partial(clip_delay_eq_Q, Q=0.707))
     infer_fx.eval()
     with torch.no_grad():
-        direct, wet = fx(y)
     direct = direct.squeeze(0).T.cpu().numpy()
     wet = wet.squeeze(0).T.cpu().numpy()
     angle = ratio * math.pi * 0.5
@@ -424,7 +429,7 @@ def vec2fx(x):
 with gr.Blocks() as demo:
-    # fx_params = gr.State(internal_mean)
     # fx = vec2fx(fx_params.value)
     # sr, y = read(EXAMPLE_PATH)
@@ -514,15 +519,16 @@ with gr.Blocks() as demo:
     with gr.Row():
         optimisation_steps = gr.Slider(
             minimum=1,
-            maximum=10000,
-            value=1000,
             label="Number of Optimisation Steps",
             interactive=True,
         )
-        prior_weight = gr.Slider(
-            minimum=0.0,
-            maximum=1.0,
-            value=0.1,
             label="Prior Weight",
             interactive=True,
         )
@@ -544,10 +550,9 @@ with gr.Blocks() as demo:
             label="Optimiser",
             interactive=True,
         )
-        lr_slider = gr.Slider(
-            minimum=1e-6,
-            maximum=1.0,
-            value=1e-3,
             label="Learning Rate",
             interactive=True,
         )
@@ -559,16 +564,22 @@ with gr.Blocks() as demo:
             #     ratio,
             #     # assign_fx_params(vec2fx(x), *all_s),
             # ),
-            inference,
         ),
         inputs=[
             audio_input,
-            audio_reference,
             dry_wet_ratio,
             method_dropdown,
             dataset_dropdown,
             embedding_dropdown,
-            remove_approx_checkbox,
             mid_side_checkbox,
             optimisation_steps,
             prior_weight,
@@ -580,6 +591,7 @@ with gr.Blocks() as demo:
             audio_output,
             direct_output,
             wet_output,
         ],
     )

 > **_Note:_** To upload your own audio, click X on the top right corner of the input audio block.
 """
+DEVICE = "cpu"
 SLIDER_MAX = 3
 SLIDER_MIN = -3
 NUMBER_OF_PCS = 4
 def inference(
     input_audio,
     ref_audio,
     method,
     dataset,
     embedding,
     mid_side,
     steps,
     prior_weight,
     optimiser,
     lr,
 ):
+    if method == "Mean":
+        return gaussian_params_dict[dataset][0].to(DEVICE)
     ref = convert2float(*ref_audio)
+    ref_loudness = meter.integrated_loudness(ref)
+    ref = pyln.normalize.loudness(ref, ref_loudness, -18.0)
+    ref = torch.from_numpy(ref).float().T.unsqueeze(0).to(DEVICE)
+    y = convert2float(*input_audio)
     loudness = meter.integrated_loudness(y)
     y = pyln.normalize.loudness(y, loudness, -18.0)
     y = torch.from_numpy(y).float().T.unsqueeze(0).to(DEVICE)
     if y.shape[1] != 1:
         y = y.mean(dim=1, keepdim=True)
     fx = deepcopy(global_fx).to(DEVICE)
     fx.train()
+    two_chs_emb_fn = chain_functions(
+        hadamard if mid_side else lambda x: x,
+        get_embedding_model(embedding),
+    )
     match method:
+        case "Nearest Neighbour":
+            vec = find_closest_training_sample(
+                fx, two_chs_emb_fn, to_fx_state_dict, preset_dict[dataset], ref, y
             )
+        case "ST-ITO":
+            vec = one_evaluation(
+                fx,
+                two_chs_emb_fn,
+                to_fx_state_dict,
+                partial(logp_x, *[x.to(DEVICE) for x in gaussian_params_dict[dataset]]),
+                internal_mean.to(DEVICE),
+                ref,
+                y,
+                optimiser_type=optimiser,
+                lr=lr,
+                steps=steps,
+                weight=prior_weight,
             )
         case _:
             raise ValueError(f"Unknown method: {method}")
+    return vec
+def render(y, remove_approx, ratio, vec):
+    y = convert2float(*y)
+    loudness = meter.integrated_loudness(y)
+    y = pyln.normalize.loudness(y, loudness, -18.0)
+    y = torch.from_numpy(y).float().T.unsqueeze(0).to(DEVICE)
     if remove_approx:
         infer_fx = instantiate(rt_config).to(DEVICE)
     else:
+        infer_fx = instantiate(fx_config).to(DEVICE)
     infer_fx.load_state_dict(vec2dict(vec), strict=False)
     # fx.apply(partial(clip_delay_eq_Q, Q=0.707))
     infer_fx.eval()
     with torch.no_grad():
+        direct, wet = infer_fx(y)
     direct = direct.squeeze(0).T.cpu().numpy()
     wet = wet.squeeze(0).T.cpu().numpy()
     angle = ratio * math.pi * 0.5
 with gr.Blocks() as demo:
+    fx_params = gr.State(internal_mean)
     # fx = vec2fx(fx_params.value)
     # sr, y = read(EXAMPLE_PATH)
     with gr.Row():
         optimisation_steps = gr.Slider(
             minimum=1,
+            maximum=100,
+            value=100,
+            step=1,
             label="Number of Optimisation Steps",
             interactive=True,
         )
+        prior_weight = gr.Dropdown(
+            [("0", 0.0), ("0.001", 0.001), ("0.01", 0.01), ("0.1", 0.1), ("1", 1.0)],
+            info="Weight of the prior distribution in the loss function. A higher value means the model will try to stay closer to the prior distribution.",
+            value=0.01,
             label="Prior Weight",
             interactive=True,
         )
             label="Optimiser",
             interactive=True,
         )
+        lr_slider = gr.Dropdown(
+            [("0.0001", 1e-4), ("0.001", 1e-3), ("0.01", 1e-2), ("0.1", 1e-1)],
+            value=1e-2,
             label="Learning Rate",
             interactive=True,
         )
             #     ratio,
             #     # assign_fx_params(vec2fx(x), *all_s),
             # ),
+            lambda audio, approx, ratio, *args: (
+                audio,
+                approx,
+                ratio,
+                inference(audio, *args),
+            ),
+            lambda audio, approx, ratio, vec: (*render(audio, approx, ratio, vec), vec),
         ),
         inputs=[
             audio_input,
+            remove_approx_checkbox,
             dry_wet_ratio,
+            audio_reference,
             method_dropdown,
             dataset_dropdown,
             embedding_dropdown,
             mid_side_checkbox,
             optimisation_steps,
             prior_weight,
             audio_output,
             direct_output,
             wet_output,
+            fx_params,
         ],
     )