nachi1326 commited on
Commit
f7942b3
·
verified ·
1 Parent(s): 99acccf

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +65 -0
  2. decoder.pth +3 -0
  3. encoder.pth +3 -0
  4. masknet.pth +3 -0
  5. sepformer-customdataset.yaml +186 -0
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dependencies for the source-separation demo app.
import sys

import gradio as gr
import torch
import yaml
from hyperpyyaml import load_hyperpyyaml
from speechbrain.inference.separation import SepformerSeparation

# Make the project's sources importable when the app is launched
# from the repository root.
sys.path.append("SOURCESEPARATION")

# Prefer GPU inference when CUDA is present; otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
12
def separate_audio(mixture):
    """Separate a two-speaker mixture into its two estimated sources.

    Parameters
    ----------
    mixture : tuple(int, numpy.ndarray)
        Gradio audio payload: ``(sample_rate, samples)``. ``samples`` may be
        1-D (mono) or 2-D ``(samples, channels)``.

    Returns
    -------
    list of (int, numpy.ndarray)
        Two ``(sample_rate, waveform)`` tuples, one per estimated speaker.
        The model operates at 16 kHz (see ``sample_rate`` in the YAML recipe),
        so outputs are tagged 16000 regardless of the input rate.
    """
    encoder_checkpoint = "models/encoder.pth"
    decoder_checkpoint = "models/decoder.pth"
    masknet_checkpoint = "models/masknet.pth"

    # NOTE(review): torch.load can execute arbitrary code from a malicious
    # checkpoint. These files ship with the app, so this is acceptable here,
    # but do not point these paths at untrusted files.
    encoder = torch.load(encoder_checkpoint, map_location=device)
    decoder = torch.load(decoder_checkpoint, map_location=device)
    masknet = torch.load(masknet_checkpoint, map_location=device)

    # Rebuild the model graph from the training hyperparameters, then
    # restore the trained weights into the instantiated modules.
    data_folder = "."
    overrides = f"data_folder: {data_folder}\noutput_folder: "
    hyperparams_file = "yamls/sepformer-customdataset.yaml"
    with open(hyperparams_file, "r") as f:
        hparams = load_hyperpyyaml(f, overrides)

    hparams['Encoder'].load_state_dict(encoder)
    hparams['Decoder'].load_state_dict(decoder)
    hparams['MaskNet'].load_state_dict(masknet)

    separator = SepformerSeparation(
        modules=hparams["modules"],
        hparams=hparams
    )

    # BUGFIX: the original `_, mixture = torch.tensor(mixture)` raised,
    # because Gradio hands over a (sample_rate, ndarray) tuple that cannot
    # be turned into a tensor and then unpacked. Unpack first, tensorize,
    # downmix to mono if needed, and add the batch dimension the model expects.
    sample_rate, samples = mixture
    batch = torch.tensor(samples, dtype=torch.float32, device=device)
    if batch.dim() == 2:
        # (samples, channels) -> mono by averaging channels
        batch = batch.mean(dim=1)
    batch = batch.unsqueeze(0)  # -> (1, samples)

    est_sources = separator.separate_batch(batch)

    # est_sources has shape (batch, samples, num_spks); return plain numpy
    # waveforms so the Gradio audio components can render them.
    s1 = est_sources[:, :, 0].detach().cpu().squeeze(0).numpy()
    s2 = est_sources[:, :, 1].detach().cpu().squeeze(0).numpy()
    return [(16000, s1), (16000, s2)]
49
+
50
# ---------------------------------------------------------------------------
# Gradio wiring: one upload widget in, one audio player per separated stream.
# ---------------------------------------------------------------------------

# Audio input restricted to file uploads, with a custom waveform colour.
input_audio = gr.Audio(
    sources=["upload"],
    waveform_options=dict(waveform_color="#01C6FF"),
)

# Output players for the two estimated sources; playback starts manually.
output_audio1 = gr.Audio(autoplay=False)
output_audio2 = gr.Audio(autoplay=False)

# Bind the separation function to the UI.
interface = gr.Interface(
    fn=separate_audio,
    inputs=input_audio,
    outputs=[output_audio1, output_audio2],
    title="Source Separation",
)

# Start serving the demo.
interface.launch()
decoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5bb3ff7438b5c524f804865428b3ace9dcbf9e237484ef62423ece9bd7d3cef
3
+ size 17628
encoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2b1ba5da43d6d814304a576dccd34df365aaec5e84f15778a1b96a33ab0b4de
3
+ size 17692
masknet.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd1eafc33bf985e80d9a054715fb4ff30581dfce8ab69379be3e5479ce8d395b
3
+ size 32028552
sepformer-customdataset.yaml ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ################################
2
+ # Model: SepFormer for source separation
3
+ # https://arxiv.org/abs/2010.13154
4
+ # Dataset : Custom dataset
5
+ # ################################
6
+ #
7
+ # Basic parameters
8
+ # Seed needs to be set at top of yaml, before objects with parameters are made
9
+ #
10
+ seed: 1234
11
+ __set_seed: !apply:torch.manual_seed [!ref <seed>]
12
+
13
+ # Data params
14
+
15
+ # e.g. '/yourpath/wsj0-mix/2speakers'
16
+ # end with 2speakers for wsj0-2mix or 3speakers for wsj0-3mix
17
+ data_folder: !PLACEHOLDER
18
+
19
+ # the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
20
+ # e.g. /yourpath/wsj0-processed/si_tr_s/
21
+ # you need to convert the original wsj0 to 8k
22
+ # you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
23
+ base_folder_dm: /yourpath/wsj0-processed/si_tr_s/
24
+
25
+ experiment_name: sepformer-custom
26
+ output_folder: !ref results/<experiment_name>/<seed>
27
+ train_log: !ref <output_folder>/train_log.txt
28
+ save_folder: !ref <output_folder>/save
29
+ train_data: !ref <save_folder>/custom_train.csv
30
+ valid_data: !ref <save_folder>/custom_valid.csv
31
+ test_data: !ref <save_folder>/custom_test.csv
32
+ skip_prep: False
33
+
34
+
35
+ # Experiment params
36
+ precision: fp32 # bf16, fp16 or fp32
37
+ num_spks: 2 # set to 3 for wsj0-3mix
38
+ noprogressbar: False
39
+ save_audio: True # Save estimated sources on disk
40
+ sample_rate: 16000
41
+
42
+ ####################### Training Parameters ####################################
43
+ N_epochs: 3
44
+ batch_size: 1
45
+ lr: 0.00015
46
+ clip_grad_norm: 5
47
+ loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
48
+ # if True, the training sequences are cut to a specified length
49
+ limit_training_signal_len: False
50
+ # this is the length of sequences if we choose to limit
51
+ # the signal length of training sequences
52
+ training_signal_len: 32000
53
+
54
+ # Set it to True to dynamically create mixtures at training time
55
+ dynamic_mixing: False
56
+
57
+ # Parameters for data augmentation
58
+ use_wavedrop: False
59
+ use_speedperturb: False
60
+ use_rand_shift: False
61
+ min_shift: -8000
62
+ max_shift: 8000
63
+
64
+ # Speed perturbation
65
+ speed_changes: [95, 100, 105] # List of speed changes for time-stretching
66
+
67
+ speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
68
+ orig_freq: !ref <sample_rate>
69
+ speeds: !ref <speed_changes>
70
+
71
+ # Frequency drop: randomly drops a number of frequency bands to zero.
72
+ drop_freq_low: 0 # Min frequency band dropout probability
73
+ drop_freq_high: 1 # Max frequency band dropout probability
74
+ drop_freq_count_low: 1 # Min number of frequency bands to drop
75
+ drop_freq_count_high: 3 # Max number of frequency bands to drop
76
+ drop_freq_width: 0.05 # Width of frequency bands to drop
77
+
78
+ drop_freq: !new:speechbrain.augment.time_domain.DropFreq
79
+ drop_freq_low: !ref <drop_freq_low>
80
+ drop_freq_high: !ref <drop_freq_high>
81
+ drop_freq_count_low: !ref <drop_freq_count_low>
82
+ drop_freq_count_high: !ref <drop_freq_count_high>
83
+ drop_freq_width: !ref <drop_freq_width>
84
+
85
+ # Time drop: randomly drops a number of temporal chunks.
86
+ drop_chunk_count_low: 1 # Min number of audio chunks to drop
87
+ drop_chunk_count_high: 5 # Max number of audio chunks to drop
88
+ drop_chunk_length_low: 1000 # Min length of audio chunks to drop
89
+ drop_chunk_length_high: 2000 # Max length of audio chunks to drop
90
+
91
+ drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
92
+ drop_length_low: !ref <drop_chunk_length_low>
93
+ drop_length_high: !ref <drop_chunk_length_high>
94
+ drop_count_low: !ref <drop_chunk_count_low>
95
+ drop_count_high: !ref <drop_chunk_count_high>
96
+
97
+ # loss thresholding -- this thresholds the training loss
98
+ threshold_byloss: True
99
+ threshold: -30
100
+
101
+ # Encoder parameters
102
+ N_encoder_out: 256
103
+ out_channels: 256
104
+ kernel_size: 16
105
+ kernel_stride: 8
106
+
107
+ # Dataloader options
108
+ # Set num_workers: 0 on MacOS due to behavior of the multiprocessing library
109
+ dataloader_opts:
110
+ batch_size: !ref <batch_size>
111
+ num_workers: 3
112
+
113
+
114
+ # Specifying the network
115
+ Encoder: !new:speechbrain.lobes.models.dual_path.Encoder
116
+ kernel_size: !ref <kernel_size>
117
+ out_channels: !ref <N_encoder_out>
118
+
119
+
120
+ SBtfintra: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
121
+ num_layers: 4
122
+ d_model: !ref <out_channels>
123
+ nhead: 8
124
+ d_ffn: 1024
125
+ dropout: 0
126
+ use_positional_encoding: True
127
+ norm_before: True
128
+
129
+ SBtfinter: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
130
+ num_layers: 4
131
+ d_model: !ref <out_channels>
132
+ nhead: 8
133
+ d_ffn: 1024
134
+ dropout: 0
135
+ use_positional_encoding: True
136
+ norm_before: True
137
+
138
+ MaskNet: !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
139
+ num_spks: !ref <num_spks>
140
+ in_channels: !ref <N_encoder_out>
141
+ out_channels: !ref <out_channels>
142
+ num_layers: 1
143
+ K: 250
144
+ intra_model: !ref <SBtfintra>
145
+ inter_model: !ref <SBtfinter>
146
+ norm: ln
147
+ linear_layer_after_inter_intra: False
148
+ skip_around_intra: True
149
+
150
+ Decoder: !new:speechbrain.lobes.models.dual_path.Decoder
151
+ in_channels: !ref <N_encoder_out>
152
+ out_channels: 1
153
+ kernel_size: !ref <kernel_size>
154
+ stride: !ref <kernel_stride>
155
+ bias: False
156
+
157
+ optimizer: !name:torch.optim.Adam
158
+ lr: !ref <lr>
159
+ weight_decay: 0
160
+
161
+ loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
162
+
163
+ lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
164
+ factor: 0.5
165
+ patience: 2
166
+ dont_halve_until_epoch: 85
167
+
168
+ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
169
+ limit: !ref <N_epochs>
170
+
171
+ modules:
172
+ encoder: !ref <Encoder>
173
+ decoder: !ref <Decoder>
174
+ masknet: !ref <MaskNet>
175
+
176
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
177
+ checkpoints_dir: !ref <save_folder>
178
+ recoverables:
179
+ encoder: !ref <Encoder>
180
+ decoder: !ref <Decoder>
181
+ masknet: !ref <MaskNet>
182
+ counter: !ref <epoch_counter>
183
+ lr_scheduler: !ref <lr_scheduler>
184
+
185
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
186
+ save_file: !ref <train_log>