fakufaku
/

diffsep

Model card Files Files and versions

xet

Community

fakufaku committed on Mar 15, 2024

Commit

539d871

1 Parent(s): e769105

Diffsep model

Browse files

Files changed (3) hide show

README.md +26 -0
checkpoint.pt +3 -0
hparams.yaml +122 -0

README.md CHANGED Viewed

@@ -1,3 +1,29 @@
 ---
 license: mit
 ---

 ---
 license: mit
 ---
+Diffusion-based Generative Speech Source Separation
+This repository contains the checkpoints for the diffusion-based speech
+separation model from the paper *Diffusion-based Generative Speech Source
+Separation*, presented at ICASSP 2023.
+The code to run the model is available on [GitHub](https://github.com/fakufaku/diffusion-separation).
+### Abstract
+We propose DiffSep, a new single channel source separation method based on
+score-matching of a stochastic differential equation (SDE). We craft a tailored
+continuous time diffusion-mixing process starting from the separated sources
+and converging to a Gaussian distribution centered on their mixture. This
+formulation lets us apply the machinery of score-based generative modelling.
+First, we train a neural network to approximate the score function of the
+marginal probabilities of the diffusion-mixing process. Then, we use it to
+solve the reverse time SDE that progressively separates the sources starting
+from their mixture. We propose a modified training strategy to handle model
+mismatch and source permutation ambiguity. Experiments on the WSJ0 2mix dataset
+demonstrate the potential of the method. Furthermore, the method is also
+suitable for speech enhancement and shows performance competitive with prior
+work on the VoiceBank-DEMAND dataset.
+ID: `2022-10-23_01-37-07_experiment-model-large-multigpu_model.optimizer.lr-0.0002_model.sde.d_lambda-2.0_model.sde.sigma_min-0.05_epoch-979_si_sdr-11.271_N-30_snr-0.5_corrstep-1_denoise-True_schedule-None`

checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66227d10f97b7884b9eb3c0f27bb579d98326147e37401b6917e2b51ea8aa39d
+size 1313509474

hparams.yaml ADDED Viewed

	@@ -0,0 +1,122 @@

+config:  # hyperparameters committed alongside checkpoint.pt
+  seed: 64736289  # presumably the global RNG seed — TODO confirm
+  name: default
+  train: true
+  test: false
+  path:
+    exp_root: exp
+    datasets:
+      wsj0_mix: data/wsj0_mix  # same path is repeated in each dataset block below
+    figures: figures
+  datamodule:
+    train:
+      dl_opts:
+        num_workers: 8
+        shuffle: true  # train is the only split with shuffle enabled
+        batch_size: 6
+      dataset:
+        _target_: datasets.WSJ0_mix  # presumably a Hydra-style class path — TODO confirm
+        path: data/wsj0_mix
+        n_spkr: 2
+        fs: 8000  # presumably the sample rate in Hz — TODO confirm
+        cut: max
+        split: train
+        max_len_s: 5  # set only for train; val and test leave it null
+        max_n_samples: null
+    val:
+      dl_opts:
+        num_workers: 8
+        shuffle: false
+        batch_size: 5
+      dataset:
+        _target_: datasets.WSJ0_mix
+        path: data/wsj0_mix
+        n_spkr: 2
+        fs: 8000
+        cut: max
+        split: val
+        max_len_s: null
+        max_n_samples: null
+    test:
+      dl_opts:
+        num_workers: 8
+        shuffle: false
+        batch_size: 5
+      dataset:
+        _target_: datasets.WSJ0_mix
+        path: data/wsj0_mix
+        n_spkr: 2
+        fs: 8000
+        cut: max
+        split: test
+        max_len_s: null
+        max_n_samples: null
+  model:
+    n_speakers: 2  # same value as the datasets' n_spkr and score_model.num_sources
+    fs: 8000  # same value as the datasets' fs
+    t_eps: 0.03
+    t_rev_init: 0.03  # same value as t_eps
+    ema_decay: 0.999
+    valid_max_sep_batches: 2
+    time_sampling_strategy: uniform
+    train_source_order: power
+    init_hack: 5  # NOTE(review): meaning not visible here; see also init_hack_p below
+    mmnr_thresh_pit: -10.0
+    score_model:
+      _target_: models.score_models.ScoreModelNCSNpp
+      num_sources: 2
+      stft_args:
+        n_fft: 510  # a one-sided STFT would give 510/2 + 1 = 256 bins — presumably intentional
+        hop_length: 128
+        center: true
+        pad_mode: constant
+      backbone_args:
+        _target_: models.ncsnpp.NCSNpp
+        nf: 128
+      transform: exponent
+      spec_abs_exponent: 0.5
+      spec_factor: 0.15
+    sde:
+      _target_: sdes.sdes.MixSDE
+      ndim: 2
+      d_lambda: 2.0  # matches d_lambda-2.0 in the README run ID
+      sigma_min: 0.05  # matches sigma_min-0.05 in the README run ID
+      sigma_max: 0.5
+      'N': 30  # quoted: a bare N can be read as a boolean by YAML 1.1 parsers
+    sampler:
+      'N': 30  # README run ID lists N-30
+      snr: 0.5  # README run ID lists snr-0.5
+      corrector_steps: 1  # README run ID lists corrstep-1
+    loss:
+      _target_: torch.nn.MSELoss
+    main_val_loss: val/si_sdr  # names the entry defined under val_losses
+    main_val_loss_mode: max
+    val_losses:
+      val/si_sdr:
+        _target_: models.losses.SISDRLoss
+        zero_mean: true
+        clamp_db: 30
+        reduction: mean
+        sign_flip: true
+    optimizer:
+      _target_: torch.optim.Adam
+      lr: 0.0002  # matches lr-0.0002 in the README run ID
+      weight_decay: 0.0
+    scheduler: null  # no scheduler configured
+    grad_clipper:
+      _target_: utils.FixedClipper
+      max_norm: 5.0
+    init_hack_p: 0.1
+  trainer:
+    _target_: pytorch_lightning.Trainer
+    accumulate_grad_batches: 2
+    min_epochs: 1
+    max_epochs: 1000  # README run ID reports epoch-979
+    deterministic: true
+    accelerator: gpu
+    devices: -1
+    strategy: ddp
+    auto_select_gpus: true  # NOTE(review): believed removed in Lightning 2.x — confirm pinned version
+    check_val_every_n_epoch: 5
+    default_root_dir: .
+    profiler: false