Spaces:

yoyolicoris
/

diffvox-ito

Running on Zero

App Files Community

yoyolicoris committed on Jul 8, 2025

Commit

90573cb

1 Parent(s): a59e3f3

feat: enable cuda device option

Browse files

Files changed (1) hide show

app.py +41 -15

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ from typing import Tuple, List, Optional, Union, Callable
 from modules.utils import vec2statedict, get_chunks
 from modules.fx import clip_delay_eq_Q, hadamard
 from utils import get_log_mags_from_eq, chain_functions
-from ito import find_closest_training_sample
 from st_ito.utils import (
     load_param_model,
     get_param_embeds,
@@ -47,6 +47,7 @@ Try to play around with the sliders and buttons and see what you can come up wit
 > **_Note:_** To upload your own audio, click X on the top right corner of the input audio block.
 """
 SLIDER_MAX = 3
 SLIDER_MIN = -3
 NUMBER_OF_PCS = 4
@@ -88,11 +89,18 @@ def load_presets(preset_folder: Path) -> Tensor:
     return presets
-def load_gaussian_params(f: Union[Path, str]) -> Tuple[Tensor, Tensor]:
     gauss_params = np.load(f)
     mean = torch.from_numpy(gauss_params["mean"]).float()
     cov = torch.from_numpy(gauss_params["cov"]).float()
-    return mean, cov
 preset_dict = {k: load_presets(v) for k, v in PRESET_PATH.items()}
@@ -146,13 +154,13 @@ meter = pyln.Meter(44100)
 def get_embedding_model(embedding: str) -> Callable:
     match embedding:
         case "afx-rep":
-            afx_rep = load_param_model()
             two_chs_emb_fn = lambda x: get_param_embeds(x, afx_rep, 44100)
         case "mfcc":
-            mfcc = load_mfcc_feature_extractor()
             two_chs_emb_fn = lambda x: get_feature_embeds(x, mfcc)
         case "mir":
-            mir = load_mir_feature_extractor()
             two_chs_emb_fn = lambda x: get_feature_embeds(x, mir)
         case _:
             raise ValueError(f"Unknown encoder: {embedding}")
@@ -188,34 +196,52 @@ def inference(
     loudness = meter.integrated_loudness(y)
     y = pyln.normalize.loudness(y, loudness, -18.0)
-    y = torch.from_numpy(y).float().T.unsqueeze(0)
     ref_loudness = meter.integrated_loudness(ref)
     ref = pyln.normalize.loudness(ref, ref_loudness, -18.0)
-    ref = torch.from_numpy(ref).float().T.unsqueeze(0)
     if y.shape[1] != 1:
         y = y.mean(dim=1, keepdim=True)
-    fx = deepcopy(global_fx)
     fx.train()
     match method:
         case "Mean":
             vec = gaussian_params_dict[dataset][0]
-        case "Nearest Neighbour":
             two_chs_emb_fn = chain_functions(
                 hadamard if mid_side else lambda x: x,
                 get_embedding_model(embedding),
             )
-            vec = find_closest_training_sample(
-                fx, two_chs_emb_fn, to_fx_state_dict, preset_dict[dataset], ref, y
             )
         case _:
             raise ValueError(f"Unknown method: {method}")
     if remove_approx:
-        infer_fx = instantiate(rt_config)
     else:
         infer_fx = fx
@@ -225,8 +251,8 @@ def inference(
     with torch.no_grad():
         direct, wet = fx(y)
-    direct = direct.squeeze(0).T.numpy()
-    wet = wet.squeeze(0).T.numpy()
     angle = ratio * math.pi * 0.5
     test_clipping = direct + wet
     # rendered = fx(y).squeeze(0).T.numpy()

 from modules.utils import vec2statedict, get_chunks
 from modules.fx import clip_delay_eq_Q, hadamard
 from utils import get_log_mags_from_eq, chain_functions
+from ito import find_closest_training_sample, one_evaluation
 from st_ito.utils import (
     load_param_model,
     get_param_embeds,
 > **_Note:_** To upload your own audio, click X on the top right corner of the input audio block.
 """
+DEVICE = "cuda"
 SLIDER_MAX = 3
 SLIDER_MIN = -3
 NUMBER_OF_PCS = 4
     return presets
+def load_gaussian_params(f: Union[Path, str]) -> Tuple[Tensor, Tensor, Tensor]:
     gauss_params = np.load(f)
     mean = torch.from_numpy(gauss_params["mean"]).float()
     cov = torch.from_numpy(gauss_params["cov"]).float()
+    return mean, cov, cov.logdet()
+def logp_x(mu, cov, cov_logdet, x):
+    diff = x - mu
+    b = torch.linalg.solve(cov, diff)
+    norm = diff @ b
+    return -0.5 * (norm + cov_logdet + mu.shape[0] * math.log(2 * math.pi))
 preset_dict = {k: load_presets(v) for k, v in PRESET_PATH.items()}
 def get_embedding_model(embedding: str) -> Callable:
     match embedding:
         case "afx-rep":
+            afx_rep = load_param_model().to(DEVICE)
             two_chs_emb_fn = lambda x: get_param_embeds(x, afx_rep, 44100)
         case "mfcc":
+            mfcc = load_mfcc_feature_extractor().to(DEVICE)
             two_chs_emb_fn = lambda x: get_feature_embeds(x, mfcc)
         case "mir":
+            mir = load_mir_feature_extractor().to(DEVICE)
             two_chs_emb_fn = lambda x: get_feature_embeds(x, mir)
         case _:
             raise ValueError(f"Unknown encoder: {embedding}")
     loudness = meter.integrated_loudness(y)
     y = pyln.normalize.loudness(y, loudness, -18.0)
+    y = torch.from_numpy(y).float().T.unsqueeze(0).to(DEVICE)
     ref_loudness = meter.integrated_loudness(ref)
     ref = pyln.normalize.loudness(ref, ref_loudness, -18.0)
+    ref = torch.from_numpy(ref).float().T.unsqueeze(0).to(DEVICE)
     if y.shape[1] != 1:
         y = y.mean(dim=1, keepdim=True)
+    fx = deepcopy(global_fx).to(DEVICE)
     fx.train()
     match method:
         case "Mean":
             vec = gaussian_params_dict[dataset][0]
+        case "Nearest Neighbour" | "ST-ITO":
             two_chs_emb_fn = chain_functions(
                 hadamard if mid_side else lambda x: x,
                 get_embedding_model(embedding),
             )
+            vec = (
+                find_closest_training_sample(
+                    fx, two_chs_emb_fn, to_fx_state_dict, preset_dict[dataset], ref, y
+                )
+                if method == "Nearest Neighbour"
+                else one_evaluation(
+                    fx,
+                    two_chs_emb_fn,
+                    to_fx_state_dict,
+                    partial(
+                        logp_x, *[x.to(DEVICE) for x in gaussian_params_dict[dataset]]
+                    ),
+                    internal_mean.to(DEVICE),
+                    ref,
+                    y,
+                    optimiser_type=optimiser,
+                    lr=lr,
+                    steps=steps,
+                    weight=prior_weight,
+                )
             )
         case _:
             raise ValueError(f"Unknown method: {method}")
     if remove_approx:
+        infer_fx = instantiate(rt_config).to(DEVICE)
     else:
         infer_fx = fx
     with torch.no_grad():
         direct, wet = fx(y)
+    direct = direct.squeeze(0).T.cpu().numpy()
+    wet = wet.squeeze(0).T.cpu().numpy()
     angle = ratio * math.pi * 0.5
     test_clipping = direct + wet
     # rendered = fx(y).squeeze(0).T.numpy()