# Experiment configuration for a score-based generative model that maps a
# noisy input signal to a clean target signal in a spectrogram domain.
#
# Conventions used in this file:
#   - `???` marks a mandatory value that must be supplied by the user
#     (for example, as a command-line override).
#   - `${a.b.c}` is an interpolation that resolves to another key of this file.
#   - `_target_` names the class that is instantiated from the sibling keys.
#
# NOTE(review): the nesting below was reconstructed from the key names and
# the interpolation paths; confirm it against the consuming training script.

name: score_based_generative_model

model:
  type: score_based
  sample_rate: 16000  # presumably in Hz -- confirm
  skip_nan_grad: false
  num_outputs: 1
  normalize_input: true
  # Upper bound on the number of utterances used for evaluation metrics.
  max_utts_evaluation_metrics: 50

  train_ds:
    manifest_filepath: ???  # mandatory
    input_key: noisy_filepath
    target_key: clean_filepath
    audio_duration: 2.04  # presumably in seconds -- confirm
    random_offset: true
    normalization_signal: input_signal
    batch_size: 8
    shuffle: true
    num_workers: 8
    pin_memory: true

  validation_ds:
    manifest_filepath: ???  # mandatory
    input_key: noisy_filepath
    target_key: clean_filepath
    # Unlike `model.normalize_input`, normalization is switched off here.
    normalize_input: false
    batch_size: 4
    shuffle: false
    num_workers: 4
    pin_memory: true

  # Analysis transform: audio -> spectrogram.
  encoder:
    _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
    fft_length: 510
    hop_length: 128
    magnitude_power: 0.5
    scale: 0.33

  # Synthesis transform: spectrogram -> audio. Every parameter is interpolated
  # from the encoder so the two transforms cannot drift apart.
  decoder:
    _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
    fft_length: ${model.encoder.fft_length}
    hop_length: ${model.encoder.hop_length}
    magnitude_power: ${model.encoder.magnitude_power}
    scale: ${model.encoder.scale}

  # Score estimator network.
  estimator:
    _target_: nemo.collections.audio.parts.submodules.ncsnpp.SpectrogramNoiseConditionalScoreNetworkPlusPlus
    in_channels: 2
    out_channels: 1
    conditioned_on_time: true
    num_res_blocks: 3
    pad_time_to: 64
    pad_dimension_to: 0

  # Stochastic differential equation defining the diffusion process.
  sde:
    _target_: nemo.collections.audio.parts.submodules.diffusion.OrnsteinUhlenbeckVarianceExplodingSDE
    stiffness: 1.5
    std_min: 0.05
    std_max: 0.5
    num_steps: 1000

  # Sampler; note that its `num_steps` (50) is independent of `sde.num_steps`.
  sampler:
    _target_: nemo.collections.audio.parts.submodules.diffusion.PredictorCorrectorSampler
    predictor: reverse_diffusion
    corrector: annealed_langevin_dynamics
    num_steps: 50
    num_corrector_steps: 1
    snr: 0.5

  loss:
    _target_: nemo.collections.audio.losses.MSELoss
    ndim: 4

  metrics:
    val:
      sisdr:
        _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio

  optim:
    name: adam
    # Written with an explicit decimal point: a bare `1e-4` is resolved as a
    # string (not a float) by YAML 1.1 loaders such as PyYAML.
    lr: 1.0e-4
    # Optimizer arguments.
    betas: [0.9, 0.999]
    weight_decay: 0.0

trainer:
  devices: -1
  num_nodes: 1
  max_epochs: -1
  max_steps: -1
  val_check_interval: 1.0
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: null
  precision: 32
  log_every_n_steps: 25
  enable_progress_bar: true
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1
  sync_batchnorm: true
  # Checkpointing and logging are disabled here; `exp_manager` below enables
  # its own checkpoint callback and TensorBoard logger instead.
  enable_checkpointing: false
  logger: false

exp_manager:
  exp_dir: null
  name: ${name}

  # Exponential moving average of the model parameters.
  ema:
    enable: true
    decay: 0.999
    cpu_offload: false
    every_n_steps: 1
    validate_original_weights: false

  # Logging.
  create_tensorboard_logger: true

  # Checkpointing: keep the 5 checkpoints with the highest `val_sisdr`.
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # presumably produced by `model.metrics.val.sisdr` -- confirm
    monitor: val_sisdr
    mode: max
    save_top_k: 5
    always_save_nemo: true

  # Early stopping on the same metric and direction as checkpointing.
  create_early_stopping_callback: true
  early_stopping_callback_params:
    monitor: val_sisdr
    mode: max
    min_delta: 0.0
    patience: 20
    verbose: true
    strict: false

  resume_from_checkpoint: null

  # Resuming an existing run is disabled by default.
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # Optional Weights & Biases logger; `name` and `project` must be set when
  # `create_wandb_logger` is enabled -- TODO confirm.
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null