# ################################
# Model: SepFormer for source separation
# https://arxiv.org/abs/2010.13154
# Dataset : Custom dataset
# ################################
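#
# Typical usage (the standard SpeechBrain recipe invocation; the exact name of
# the training script depends on the recipe this file ships with, so adjust it
# if needed):
#   python train.py sepformer-customdataset.yaml --data_folder=/path/to/your/dataset
#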
#
# Basic parameters
# Seed needs to be set at top of yaml, before objects with parameters are made
#
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]
# Data params
# Path to the root folder of your custom dataset,
# e.g. '/yourpath/wsj0-mix/2speakers' if you follow the wsj0-mix layout
# (end with 2speakers for wsj0-2mix or 3speakers for wsj0-3mix)
data_folder: !PLACEHOLDER
# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
# e.g. /yourpath/wsj0-processed/si_tr_s/
# the audio in this folder must match the sample_rate set below
# (16 kHz here, rather than the 8 kHz used by the original wsj0 recipes)
# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
base_folder_dm: /yourpath/wsj0-processed/si_tr_s/
experiment_name: sepformer-custom
output_folder: !ref results/<experiment_name>/<seed>
train_log: !ref <output_folder>/train_log.txt
save_folder: !ref <output_folder>/save
train_data: !ref <save_folder>/custom_train.csv
valid_data: !ref <save_folder>/custom_valid.csv
test_data: !ref <save_folder>/custom_test.csv
skip_prep: False
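# The CSV manifests above are created by the recipe's data-preparation step
# (skipped when skip_prep is True). In the SpeechBrain separation recipes they
# typically hold one row per mixture, with columns such as ID, duration,
# mix_wav, s1_wav and s2_wav; check the prepare script shipped with this
# recipe for the exact schema expected by your custom dataset.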
# Experiment params
precision: fp32 # bf16, fp16 or fp32
num_spks: 2 # number of sources to separate (set to 3 for 3-speaker data such as wsj0-3mix)
noprogressbar: False
save_audio: True # Save estimated sources on disk
sample_rate: 16000
####################### Training Parameters ####################################
N_epochs: 3
batch_size: 1
lr: 0.00015
clip_grad_norm: 5
loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
# if True, the training sequences are cut to a specified length
limit_training_signal_len: False
# length (in samples) to which training sequences are cut
# when limit_training_signal_len is True
training_signal_len: 32000
# Set it to True to dynamically create mixtures at training time
dynamic_mixing: False
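# In the SpeechBrain SepFormer recipes, dynamic mixing builds training mixtures
# on the fly by summing randomly drawn source signals from base_folder_dm
# instead of reading pre-mixed files, so that folder must contain real
# single-speaker audio at the sample_rate configured above.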
# Parameters for data augmentation
use_wavedrop: False
use_speedperturb: False
use_rand_shift: False
min_shift: -8000
max_shift: 8000
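# When use_rand_shift is True, the training script (in the standard SepFormer
# recipe) rolls each source by a random number of samples drawn from
# [min_shift, max_shift]; at 16 kHz the values above correspond to +/- 0.5 s.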
# Speed perturbation
speed_changes: [95, 100, 105] # Percentages of the original speed used for speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
orig_freq: !ref <sample_rate>
speeds: !ref <speed_changes>
# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0 # Lowest frequency that can be dropped, as a fraction of the Nyquist frequency
drop_freq_high: 1 # Highest frequency that can be dropped, as a fraction of the Nyquist frequency
drop_freq_count_low: 1 # Min number of frequency bands to drop
drop_freq_count_high: 3 # Max number of frequency bands to drop
drop_freq_width: 0.05 # Width of frequency bands to drop
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
drop_freq_low: !ref <drop_freq_low>
drop_freq_high: !ref <drop_freq_high>
drop_freq_count_low: !ref <drop_freq_count_low>
drop_freq_count_high: !ref <drop_freq_count_high>
drop_freq_width: !ref <drop_freq_width>
# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1 # Min number of audio chunks to drop
drop_chunk_count_high: 5 # Max number of audio chunks to drop
drop_chunk_length_low: 1000 # Min length (in samples) of audio chunks to drop
drop_chunk_length_high: 2000 # Max length (in samples) of audio chunks to drop
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
drop_length_low: !ref <drop_chunk_length_low>
drop_length_high: !ref <drop_chunk_length_high>
drop_count_low: !ref <drop_chunk_count_low>
drop_count_high: !ref <drop_chunk_count_high>
# loss thresholding -- limits which utterances contribute to the training loss (see note below)
threshold_byloss: True
threshold: -30
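# In the standard SepFormer recipe, the loss is the negative SI-SNR, so with
# threshold_byloss: True only utterances whose loss is above <threshold>
# (i.e. SI-SNR below 30 dB) are kept when averaging the batch loss; examples
# that are already separated almost perfectly are excluded.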
# Encoder parameters
N_encoder_out: 256
out_channels: 256
kernel_size: 16
kernel_stride: 8
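# At the 16 kHz sample rate used here, a kernel of 16 samples with a stride of
# 8 corresponds to 1 ms analysis windows with a 0.5 ms hop in the learned
# encoder/decoder filterbank.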
# Dataloader options
# Set num_workers: 0 on macOS due to behavior of the multiprocessing library
dataloader_opts:
batch_size: !ref <batch_size>
num_workers: 3
# Specifying the network
Encoder: !new:speechbrain.lobes.models.dual_path.Encoder
kernel_size: !ref <kernel_size>
out_channels: !ref <N_encoder_out>
SBtfintra: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
num_layers: 4
d_model: !ref <out_channels>
nhead: 8
d_ffn: 1024
dropout: 0
use_positional_encoding: True
norm_before: True
SBtfinter: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
num_layers: 4
d_model: !ref <out_channels>
nhead: 8
d_ffn: 1024
dropout: 0
use_positional_encoding: True
norm_before: True
MaskNet: !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
num_spks: !ref <num_spks>
in_channels: !ref <N_encoder_out>
out_channels: !ref <out_channels>
num_layers: 1
K: 250
intra_model: !ref <SBtfintra>
inter_model: !ref <SBtfinter>
norm: ln
linear_layer_after_inter_intra: False
skip_around_intra: True
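# K is the chunk length of the dual-path processing: the encoded signal is
# split into overlapping chunks of 250 encoder frames, which are processed by
# the intra-chunk and inter-chunk transformer blocks defined above.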
Decoder: !new:speechbrain.lobes.models.dual_path.Decoder
in_channels: !ref <N_encoder_out>
out_channels: 1
kernel_size: !ref <kernel_size>
stride: !ref <kernel_stride>
bias: False
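# Together these three blocks implement the usual masking pipeline: the Encoder
# turns the waveform into a learned filterbank representation, the MaskNet
# (dual-path SepFormer) predicts one mask per speaker, and the Decoder maps
# each masked representation back to a time-domain signal.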
optimizer: !name:torch.optim.Adam
lr: !ref <lr>
weight_decay: 0
loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
factor: 0.5
patience: 2
dont_halve_until_epoch: 85
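# Note: with N_epochs set to 3 above, dont_halve_until_epoch: 85 means the
# learning rate is never reduced during this run; lower it (or raise N_epochs)
# if you want the plateau scheduler to take effect.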
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <N_epochs>
modules:
encoder: !ref <Encoder>
decoder: !ref <Decoder>
masknet: !ref <MaskNet>
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
encoder: !ref <Encoder>
decoder: !ref <Decoder>
masknet: !ref <MaskNet>
counter: !ref <epoch_counter>
lr_scheduler: !ref <lr_scheduler>
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>