yoyolicoris committed on
Commit
df0ae2d
·
1 Parent(s): 40b18c2

add regression model checkpoints and necessary dependencies

Browse files
ltng/regression.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+ import lightning.pytorch as pl
6
+ from typing import Tuple, List, Optional
7
+
8
+
9
+ class ParamPrediction(pl.LightningModule):
10
+ def __init__(
11
+ self,
12
+ predictor: nn.Module,
13
+ condition: str = "wet",
14
+ **kwargs,
15
+ ) -> None:
16
+ super().__init__()
17
+
18
+ self.predictor = predictor
19
+ self.condition = condition
20
+
21
+ def forward(
22
+ self,
23
+ wet: Optional[torch.Tensor] = None,
24
+ dry: Optional[torch.Tensor] = None,
25
+ ):
26
+ match self.condition:
27
+ case "wet":
28
+ return self.predictor(wet)
29
+ case "dry":
30
+ return self.predictor(dry)
31
+ case "both":
32
+ return self.predictor(wet, dry)
33
+ case _:
34
+ raise ValueError(f"Invalid condition: {self.condition}")
35
+
36
+ def training_step(self, batch, batch_idx):
37
+ x, cond, dry, rel_path = batch
38
+ pred = self(cond, dry)
39
+
40
+ loss = F.mse_loss(pred, x)
41
+
42
+ self.log("loss", loss.item(), prog_bar=True, sync_dist=True)
43
+
44
+ return loss
45
+
46
+ def on_validation_epoch_start(self) -> None:
47
+ self.tmp_val_outputs = []
48
+
49
+ def validation_step(self, batch, batch_idx):
50
+ x, cond, dry, *_ = batch
51
+
52
+ pred = self(cond, dry)
53
+ loss = F.mse_loss(pred, x)
54
+
55
+ values = {
56
+ "loss": loss.item(),
57
+ "N": x.shape[0],
58
+ }
59
+ self.tmp_val_outputs.append(values)
60
+ return loss
61
+
62
+ def on_validation_epoch_end(self) -> None:
63
+ outputs = self.tmp_val_outputs
64
+ weights = [x["N"] for x in outputs]
65
+ avg_loss = np.average([x["loss"] for x in outputs], weights=weights)
66
+
67
+ self.log_dict(
68
+ {
69
+ "val_loss": avg_loss,
70
+ },
71
+ prog_bar=True,
72
+ sync_dist=True,
73
+ )
74
+
75
+ delattr(self, "tmp_val_outputs")
76
+
77
+ def on_test_epoch_start(self) -> None:
78
+ self.tmp_test_outputs = []
79
+
80
+ def test_step(self, batch, batch_idx):
81
+ x, cond, dry, *_ = batch
82
+
83
+ pred = self(cond, dry)
84
+ loss = F.mse_loss(pred, x)
85
+
86
+ values = {
87
+ "loss": loss.item(),
88
+ "N": x.shape[0],
89
+ }
90
+ self.tmp_test_outputs.append(values)
91
+ return loss
92
+
93
+ def on_test_epoch_end(self) -> None:
94
+ outputs = self.tmp_test_outputs
95
+ weights = [x["N"] for x in outputs]
96
+ avg_loss = np.average([x["loss"] for x in outputs], weights=weights)
97
+
98
+ self.log_dict(
99
+ {
100
+ "test_loss": avg_loss,
101
+ },
102
+ prog_bar=True,
103
+ sync_dist=True,
104
+ )
105
+
106
+ delattr(self, "tmp_test_outputs")
modules/encoder.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ from functools import partial, reduce
5
+ from typing import Optional, List
6
+
7
+ from .utils import chain_functions
8
+
9
+
10
class LogSpectralCentroid(nn.Module):
    """Log of the spectral centroid over normalised frequency [0, 1].

    Input ``spec`` is assumed to be (..., freq, time); output is
    (..., 1, time).
    """

    def forward(self, spec):
        freq_axis = torch.linspace(0, 1, spec.size(-2), device=spec.device)
        energy = spec.transpose(-1, -2)  # (..., time, freq)
        total = energy.sum(-1, keepdim=True).clamp_min(1e-8)
        centroid = (energy / total) @ freq_axis  # (..., time)
        return (centroid + 1e-8).log().unsqueeze(-2)
17
+
18
+
19
class LogSpectralFlatness(nn.Module):
    """Log spectral flatness: log geometric mean minus log arithmetic mean
    of the power spectrum along the frequency axis.

    Input ``spec`` is assumed to be (..., freq, time); output is
    (..., 1, time). Always <= 0; closer to 0 means flatter (noisier) spectrum.
    """

    def forward(self, spec):
        power = spec.clamp(1e-8).square()
        geometric = power.log().mean(-2, keepdim=True)
        arithmetic = power.mean(-2, keepdim=True).log()
        return geometric - arithmetic
26
+
27
+
28
+ class LogSpectralBandwidth(nn.Module):
29
+ def __init__(self):
30
+ super().__init__()
31
+ self.centroid = LogSpectralCentroid()
32
+
33
+ def forward(self, spec):
34
+ # assume spec is of shape (..., freq, time)
35
+ freqs = torch.linspace(0, 1, spec.size(-2), device=spec.device)
36
+ centroid = self.centroid(spec).exp()
37
+ normalised_spec = spec / spec.sum(-2, keepdim=True).clamp_min(1e-8)
38
+ return (
39
+ torch.log(
40
+ (normalised_spec * (freqs[:, None] - centroid).square()).sum(
41
+ -2, keepdim=True
42
+ )
43
+ + 1e-8
44
+ )
45
+ * 0.5
46
+ )
47
+
48
+
49
class LogRMS(nn.Module):
    """Log root-mean-square of each frame along the sample axis (-2).

    Input ``frame`` is assumed to be (..., frame_length, n_frames); output
    is (..., 1, n_frames).
    """

    def forward(self, frame):
        mean_square = frame.square().mean(-2, keepdim=True)
        return (mean_square.sqrt() + 1e-8).log()
52
+
53
+
54
+ class LogCrest(nn.Module):
55
+ def __init__(self):
56
+ super().__init__()
57
+ self.rms = LogRMS()
58
+
59
+ def forward(self, frame):
60
+ log_rms = self.rms(frame)
61
+ return frame.abs().amax(-2, keepdim=True).add(1e-8).log() - log_rms
62
+
63
+
64
+ class LogSpread(nn.Module):
65
+ def __init__(self):
66
+ super().__init__()
67
+ self.rms = LogRMS()
68
+
69
+ def forward(self, frame):
70
+ log_rms = self.rms(frame)
71
+ return (frame.abs().add(1e-8).log() - log_rms).mean(-2, keepdim=True)
72
+
73
+
74
class MapAndMerge(nn.Module):
    """Apply each module in ``funcs`` to the same input and concatenate
    the results along ``dim``."""

    def __init__(self, funcs: List[nn.Module], dim=-1):
        super().__init__()
        # ModuleList so sub-module parameters are registered/trained.
        self.funcs = nn.ModuleList(funcs)
        self.dim = dim

    def forward(self, frame):
        branch_outputs = [fn(frame) for fn in self.funcs]
        return torch.cat(branch_outputs, dim=self.dim)
82
+
83
+
84
class Frame(nn.Module):
    """Slice a waveform into overlapping frames.

    Output is (..., frame_length, n_frames). With ``center=True`` the
    waveform is zero-padded by frame_length // 2 on each side first.
    """

    def __init__(self, frame_length, hop_length, center=False):
        super().__init__()
        self.frame_length = frame_length
        self.hop_length = hop_length
        self.center = center

    def forward(self, waveform):
        if self.center:
            half = self.frame_length // 2
            waveform = F.pad(waveform, (half, half))
        frames = waveform.unfold(-1, self.frame_length, self.hop_length)
        # unfold yields (..., n_frames, frame_length); put samples on -2.
        return frames.transpose(-1, -2)
95
+
96
+
97
class StatisticReduction(nn.Module):
    """Reduce ``dim`` to four summary statistics: mean, (population) std,
    skewness, and excess kurtosis, concatenated along the same ``dim``."""

    def __init__(self, dim=-1):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        mean = x.mean(self.dim, keepdim=True)
        centered = x - mean
        stdev = centered.square().mean(self.dim, keepdim=True).sqrt()
        # Clamp avoids division by zero for constant inputs.
        z = centered / stdev.clamp_min(1e-8)
        skewness = z.pow(3).mean(self.dim, keepdim=True)
        # Subtract 3 so a Gaussian has zero (excess) kurtosis.
        excess_kurtosis = z.pow(4).mean(self.dim, keepdim=True) - 3
        return torch.cat([mean, stdev, skewness, excess_kurtosis], dim=self.dim)
reg-ckpts/checkpoints/epoch=99-step=6500-val_loss=0.842.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3559ca46370e3d00c96498107e81e98119d65c9ee3ecc2bd45e5d92f8b51c9a5
3
+ size 111225779
reg-ckpts/config.yaml ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # lightning.pytorch==2.4.0
2
+ seed_everything: false
3
+ trainer:
4
+ accelerator: gpu
5
+ strategy: auto
6
+ devices: 1
7
+ num_nodes: 1
8
+ precision: null
9
+ logger:
10
+ class_path: lightning.pytorch.loggers.WandbLogger
11
+ init_args:
12
+ name: null
13
+ save_dir: .
14
+ version: null
15
+ offline: false
16
+ dir: null
17
+ id: null
18
+ anonymous: null
19
+ project: vocal-fx-regression
20
+ log_model: false
21
+ experiment: null
22
+ prefix: ''
23
+ checkpoint_name: null
24
+ job_type: null
25
+ config: null
26
+ entity: null
27
+ reinit: null
28
+ tags: null
29
+ group: null
30
+ notes: null
31
+ magic: null
32
+ config_exclude_keys: null
33
+ config_include_keys: null
34
+ mode: null
35
+ allow_val_change: null
36
+ resume: null
37
+ force: null
38
+ tensorboard: null
39
+ sync_tensorboard: null
40
+ monitor_gym: null
41
+ save_code: null
42
+ fork_from: null
43
+ resume_from: null
44
+ settings: null
45
+ callbacks:
46
+ - class_path: lightning.pytorch.callbacks.ModelCheckpoint
47
+ init_args:
48
+ dirpath: null
49
+ filename: '{epoch}-{step}-{val_loss:.3f}'
50
+ monitor: val_loss
51
+ verbose: false
52
+ save_last: true
53
+ save_top_k: 3
54
+ save_weights_only: false
55
+ mode: min
56
+ auto_insert_metric_name: true
57
+ every_n_train_steps: null
58
+ train_time_interval: null
59
+ every_n_epochs: 10
60
+ save_on_train_epoch_end: null
61
+ enable_version_counter: true
62
+ fast_dev_run: false
63
+ max_epochs: null
64
+ min_epochs: null
65
+ max_steps: 100000
66
+ min_steps: null
67
+ max_time: null
68
+ limit_train_batches: null
69
+ limit_val_batches: null
70
+ limit_test_batches: null
71
+ limit_predict_batches: null
72
+ overfit_batches: 0.0
73
+ val_check_interval: null
74
+ check_val_every_n_epoch: 10
75
+ num_sanity_val_steps: 2
76
+ log_every_n_steps: 1
77
+ enable_checkpointing: null
78
+ enable_progress_bar: null
79
+ enable_model_summary: null
80
+ accumulate_grad_batches: 1
81
+ gradient_clip_val: null
82
+ gradient_clip_algorithm: null
83
+ deterministic: null
84
+ benchmark: null
85
+ inference_mode: true
86
+ use_distributed_sampler: true
87
+ profiler: null
88
+ detect_anomaly: false
89
+ barebones: false
90
+ plugins: null
91
+ sync_batchnorm: false
92
+ reload_dataloaders_every_n_epochs: 0
93
+ default_root_dir: null
94
+ ckpt_path: null
95
+ data:
96
+ class_path: ltng.aug_data.GenDataModule
97
+ init_args:
98
+ train_root: /data2/chin-yun/sub_train
99
+ batch_size: 64
100
+ val_root: /data2/chin-yun/sub_val
101
+ test_root: null
102
+ optimizer:
103
+ class_path: torch.optim.AdamW
104
+ init_args:
105
+ lr: 0.001
106
+ betas:
107
+ - 0.9
108
+ - 0.999
109
+ eps: 1.0e-08
110
+ weight_decay: 0.01
111
+ amsgrad: false
112
+ maximize: false
113
+ foreach: null
114
+ capturable: false
115
+ differentiable: false
116
+ fused: null
117
+ model:
118
+ class_path: ltng.regression.ParamPrediction
119
+ init_args:
120
+ predictor:
121
+ class_path: modules.model.LightningSequential
122
+ init_args:
123
+ modules:
124
+ - class_path: modules.encoder.MapAndMerge
125
+ init_args:
126
+ funcs:
127
+ - class_path: torch.nn.Identity
128
+ - class_path: modules.fx.Hadamard
129
+ dim: 1
130
+ - class_path: modules.encoder.MapAndMerge
131
+ init_args:
132
+ funcs:
133
+ - class_path: modules.model.LightningSequential
134
+ init_args:
135
+ modules:
136
+ - class_path: modules.encoder.Frame
137
+ init_args:
138
+ frame_length: 1024
139
+ hop_length: 256
140
+ center: true
141
+ - class_path: modules.encoder.MapAndMerge
142
+ init_args:
143
+ funcs:
144
+ - class_path: modules.encoder.LogRMS
145
+ - class_path: modules.encoder.LogCrest
146
+ - class_path: modules.encoder.LogSpread
147
+ dim: -2
148
+ - class_path: modules.model.LogMelSpectrogram
149
+ init_args:
150
+ sample_rate: 44100
151
+ n_fft: 1024
152
+ win_length: null
153
+ hop_length: 256
154
+ f_min: 0.0
155
+ f_max: null
156
+ pad: 0
157
+ n_mels: 80
158
+ window_fn: torch.hann_window
159
+ power: 2.0
160
+ normalized: false
161
+ wkwargs: null
162
+ center: true
163
+ pad_mode: reflect
164
+ onesided: null
165
+ norm: null
166
+ mel_scale: htk
167
+ dim: -2
168
+ - class_path: torch.nn.Flatten
169
+ init_args:
170
+ start_dim: 1
171
+ end_dim: -2
172
+ - class_path: torch.nn.Conv1d
173
+ init_args:
174
+ in_channels: 332
175
+ out_channels: 512
176
+ kernel_size: 5
177
+ stride: 1
178
+ padding: 0
179
+ dilation: 1
180
+ groups: 1
181
+ bias: true
182
+ padding_mode: zeros
183
+ device: null
184
+ dtype: null
185
+ - class_path: torch.nn.AvgPool1d
186
+ init_args:
187
+ kernel_size: 3
188
+ stride: 3
189
+ padding: 0
190
+ ceil_mode: false
191
+ count_include_pad: true
192
+ - class_path: torch.nn.BatchNorm1d
193
+ init_args:
194
+ num_features: 512
195
+ eps: 1.0e-05
196
+ momentum: 0.1
197
+ affine: true
198
+ track_running_stats: true
199
+ device: null
200
+ dtype: null
201
+ - class_path: torch.nn.ReLU
202
+ init_args:
203
+ inplace: false
204
+ - class_path: torch.nn.Conv1d
205
+ init_args:
206
+ in_channels: 512
207
+ out_channels: 512
208
+ kernel_size: 5
209
+ stride: 1
210
+ padding: 0
211
+ dilation: 1
212
+ groups: 1
213
+ bias: true
214
+ padding_mode: zeros
215
+ device: null
216
+ dtype: null
217
+ - class_path: torch.nn.AvgPool1d
218
+ init_args:
219
+ kernel_size: 3
220
+ stride: 3
221
+ padding: 0
222
+ ceil_mode: false
223
+ count_include_pad: true
224
+ - class_path: torch.nn.BatchNorm1d
225
+ init_args:
226
+ num_features: 512
227
+ eps: 1.0e-05
228
+ momentum: 0.1
229
+ affine: true
230
+ track_running_stats: true
231
+ device: null
232
+ dtype: null
233
+ - class_path: torch.nn.ReLU
234
+ init_args:
235
+ inplace: false
236
+ - class_path: torch.nn.Conv1d
237
+ init_args:
238
+ in_channels: 512
239
+ out_channels: 768
240
+ kernel_size: 5
241
+ stride: 1
242
+ padding: 0
243
+ dilation: 1
244
+ groups: 1
245
+ bias: true
246
+ padding_mode: zeros
247
+ device: null
248
+ dtype: null
249
+ - class_path: torch.nn.AvgPool1d
250
+ init_args:
251
+ kernel_size: 3
252
+ stride: 3
253
+ padding: 0
254
+ ceil_mode: false
255
+ count_include_pad: true
256
+ - class_path: torch.nn.BatchNorm1d
257
+ init_args:
258
+ num_features: 768
259
+ eps: 1.0e-05
260
+ momentum: 0.1
261
+ affine: true
262
+ track_running_stats: true
263
+ device: null
264
+ dtype: null
265
+ - class_path: torch.nn.ReLU
266
+ init_args:
267
+ inplace: false
268
+ - class_path: torch.nn.Conv1d
269
+ init_args:
270
+ in_channels: 768
271
+ out_channels: 1024
272
+ kernel_size: 5
273
+ stride: 1
274
+ padding: 0
275
+ dilation: 1
276
+ groups: 1
277
+ bias: true
278
+ padding_mode: zeros
279
+ device: null
280
+ dtype: null
281
+ - class_path: torch.nn.AvgPool1d
282
+ init_args:
283
+ kernel_size: 3
284
+ stride: 3
285
+ padding: 0
286
+ ceil_mode: false
287
+ count_include_pad: true
288
+ - class_path: torch.nn.BatchNorm1d
289
+ init_args:
290
+ num_features: 1024
291
+ eps: 1.0e-05
292
+ momentum: 0.1
293
+ affine: true
294
+ track_running_stats: true
295
+ device: null
296
+ dtype: null
297
+ - class_path: torch.nn.ReLU
298
+ init_args:
299
+ inplace: false
300
+ - class_path: torch.nn.Conv1d
301
+ init_args:
302
+ in_channels: 1024
303
+ out_channels: 1024
304
+ kernel_size: 1
305
+ stride: 1
306
+ padding: 0
307
+ dilation: 1
308
+ groups: 1
309
+ bias: true
310
+ padding_mode: zeros
311
+ device: null
312
+ dtype: null
313
+ - class_path: torch.nn.AdaptiveMaxPool1d
314
+ init_args:
315
+ output_size: 1
316
+ return_indices: false
317
+ - class_path: torch.nn.Flatten
318
+ init_args:
319
+ start_dim: 1
320
+ end_dim: -1
321
+ - class_path: torch.nn.Linear
322
+ init_args:
323
+ in_features: 1024
324
+ out_features: 130
325
+ bias: true
326
+ device: null
327
+ dtype: null
328
+ condition: wet
reg-ckpts/param_stats.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddbef7000cb8d9ac735dfb3ccd6429df0668532c8779ac52774c032fb9058b4e
3
+ size 2480