Spaces: Runtime error
Upload 5 files
- app.py +65 -0
- decoder.pth +3 -0
- encoder.pth +3 -0
- masknet.pth +3 -0
- sepformer-customdataset.yaml +186 -0
app.py
ADDED
@@ -0,0 +1,65 @@
+import gradio as gr
+import torch
+from hyperpyyaml import load_hyperpyyaml
+from speechbrain.inference.separation import SepformerSeparation
+
+import sys
+sys.path.append("SOURCESEPARATION")
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def separate_audio(mixture):
+    # Paths to the per-module state dicts shipped with this Space
+    encoder_checkpoint = "models/encoder.pth"
+    decoder_checkpoint = "models/decoder.pth"
+    masknet_checkpoint = "models/masknet.pth"
+
+    encoder = torch.load(encoder_checkpoint, map_location=device)
+    decoder = torch.load(decoder_checkpoint, map_location=device)
+    masknet = torch.load(masknet_checkpoint, map_location=device)
+
+    # Build the modules declared in the hyperparameter file; only the
+    # placeholder paths need overriding for inference
+    data_folder = "."
+    overrides = f"data_folder: {data_folder}\noutput_folder: "
+    hyperparams_file = "yamls/sepformer-customdataset.yaml"
+    with open(hyperparams_file, "r") as f:
+        hparams = load_hyperpyyaml(f, overrides)
+
+    # Restore the trained weights into the freshly built modules
+    hparams["Encoder"].load_state_dict(encoder)
+    hparams["Decoder"].load_state_dict(decoder)
+    hparams["MaskNet"].load_state_dict(masknet)
+
+    separator = SepformerSeparation(
+        modules=hparams["modules"],
+        hparams=hparams
+    )
+
+    # gr.Audio delivers a (sample_rate, numpy_array) tuple; convert the
+    # samples to a batched float tensor before separation
+    sr, samples = mixture
+    samples = torch.tensor(samples, dtype=torch.float32)
+    if samples.dim() > 1:  # mix stereo uploads down to mono
+        samples = samples.mean(dim=1)
+    samples = samples / (samples.abs().max() + 1e-8)  # scale int16 input to [-1, 1]
+    est_sources = separator.separate_batch(samples.unsqueeze(0))
+
+    s1 = est_sources[:, :, 0].squeeze(0).cpu().numpy()
+    s2 = est_sources[:, :, 1].squeeze(0).cpu().numpy()
+    # Return the separated sources at the model's 16 kHz sample rate
+    return [(16000, s1), (16000, s2)]
+
+# Define the audio input component
+input_audio = gr.Audio(sources=["upload"], waveform_options=dict(waveform_color="#01C6FF"))
+
+# Define the audio output components (one for each separated stream)
+output_audio1 = gr.Audio(autoplay=False)
+output_audio2 = gr.Audio(autoplay=False)
+
+# Create the Gradio interface
+interface = gr.Interface(
+    fn=separate_audio,
+    inputs=input_audio,
+    outputs=[output_audio1, output_audio2],
+    title="Source Separation"
+)
+
+interface.launch()
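Note that separate_audio reloads the checkpoints and rebuilds the separator on every request. A minimal sketch of a one-time setup at module scope, reusing the names defined above (build_separator and SEPARATOR are illustrative, not part of the committed app):

def build_separator():
    # Instantiate the modules from the YAML once, then load the trained weights.
    with open("yamls/sepformer-customdataset.yaml") as f:
        hparams = load_hyperpyyaml(f, "data_folder: .\noutput_folder: ")
    for name, path in [("Encoder", "models/encoder.pth"),
                       ("Decoder", "models/decoder.pth"),
                       ("MaskNet", "models/masknet.pth")]:
        hparams[name].load_state_dict(torch.load(path, map_location=device))
    return SepformerSeparation(modules=hparams["modules"], hparams=hparams)

SEPARATOR = build_separator()
# separate_audio would then reduce to the tensor conversion plus a call to
# SEPARATOR.separate_batch(...)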
decoder.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5bb3ff7438b5c524f804865428b3ace9dcbf9e237484ef62423ece9bd7d3cef
+size 17628
encoder.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2b1ba5da43d6d814304a576dccd34df365aaec5e84f15778a1b96a33ab0b4de
+size 17692
masknet.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd1eafc33bf985e80d9a054715fb4ff30581dfce8ab69379be3e5479ce8d395b
+size 32028552
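The three .pth entries above are Git LFS pointer files: the repository stores only the spec version, the sha256 oid, and the byte size, while the actual weights live in LFS storage and are fetched on clone or download. A minimal sketch for checking that a fetched file matches its pointer (the path and the oid/size values are taken from the masknet.pth pointer above):

import hashlib
import os

def matches_pointer(path, oid, size):
    # Compare the on-disk bytes against the sha256 oid and size recorded
    # in the LFS pointer file.
    if os.path.getsize(path) != size:
        return False
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == oid

print(matches_pointer(
    "masknet.pth",
    "cd1eafc33bf985e80d9a054715fb4ff30581dfce8ab69379be3e5479ce8d395b",
    32028552,
))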
sepformer-customdataset.yaml
ADDED
@@ -0,0 +1,186 @@
+# ################################
+# Model: SepFormer for source separation
+# https://arxiv.org/abs/2010.13154
+# Dataset : Custom dataset
+# ################################
+#
+# Basic parameters
+# Seed needs to be set at top of yaml, before objects with parameters are made
+#
+seed: 1234
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+
+# Data params
+
+# e.g. '/yourpath/wsj0-mix/2speakers'
+# end with 2speakers for wsj0-2mix or 3speakers for wsj0-3mix
+data_folder: !PLACEHOLDER
+
+# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
+# e.g. /yourpath/wsj0-processed/si_tr_s/
+# you need to convert the original wsj0 to 8k
+# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
+base_folder_dm: /yourpath/wsj0-processed/si_tr_s/
+
+experiment_name: sepformer-custom
+output_folder: !ref results/<experiment_name>/<seed>
+train_log: !ref <output_folder>/train_log.txt
+save_folder: !ref <output_folder>/save
+train_data: !ref <save_folder>/custom_train.csv
+valid_data: !ref <save_folder>/custom_valid.csv
+test_data: !ref <save_folder>/custom_test.csv
+skip_prep: False
+
+
+# Experiment params
+precision: fp32 # bf16, fp16 or fp32
+num_spks: 2 # set to 3 for wsj0-3mix
+noprogressbar: False
+save_audio: True # Save estimated sources on disk
+sample_rate: 16000
+
+####################### Training Parameters ####################################
+N_epochs: 3
+batch_size: 1
+lr: 0.00015
+clip_grad_norm: 5
+loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
+# if True, the training sequences are cut to a specified length
+limit_training_signal_len: False
+# this is the length of sequences if we choose to limit
+# the signal length of training sequences
+training_signal_len: 32000
+
+# Set it to True to dynamically create mixtures at training time
+dynamic_mixing: False
+
+# Parameters for data augmentation
+use_wavedrop: False
+use_speedperturb: False
+use_rand_shift: False
+min_shift: -8000
+max_shift: 8000
+
+# Speed perturbation
+speed_changes: [95, 100, 105] # List of speed changes for time-stretching
+
+speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
+    orig_freq: !ref <sample_rate>
+    speeds: !ref <speed_changes>
+
+# Frequency drop: randomly drops a number of frequency bands to zero.
+drop_freq_low: 0 # Min frequency band dropout probability
+drop_freq_high: 1 # Max frequency band dropout probability
+drop_freq_count_low: 1 # Min number of frequency bands to drop
+drop_freq_count_high: 3 # Max number of frequency bands to drop
+drop_freq_width: 0.05 # Width of frequency bands to drop
+
+drop_freq: !new:speechbrain.augment.time_domain.DropFreq
+    drop_freq_low: !ref <drop_freq_low>
+    drop_freq_high: !ref <drop_freq_high>
+    drop_freq_count_low: !ref <drop_freq_count_low>
+    drop_freq_count_high: !ref <drop_freq_count_high>
+    drop_freq_width: !ref <drop_freq_width>
+
+# Time drop: randomly drops a number of temporal chunks.
+drop_chunk_count_low: 1 # Min number of audio chunks to drop
+drop_chunk_count_high: 5 # Max number of audio chunks to drop
+drop_chunk_length_low: 1000 # Min length of audio chunks to drop
+drop_chunk_length_high: 2000 # Max length of audio chunks to drop
+
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
+    drop_length_low: !ref <drop_chunk_length_low>
+    drop_length_high: !ref <drop_chunk_length_high>
+    drop_count_low: !ref <drop_chunk_count_low>
+    drop_count_high: !ref <drop_chunk_count_high>
+
+# loss thresholding -- this thresholds the training loss
+threshold_byloss: True
+threshold: -30
+
+# Encoder parameters
+N_encoder_out: 256
+out_channels: 256
+kernel_size: 16
+kernel_stride: 8
+
+# Dataloader options
+# Set num_workers: 0 on MacOS due to behavior of the multiprocessing library
+dataloader_opts:
+    batch_size: !ref <batch_size>
+    num_workers: 3
+
+
+# Specifying the network
+Encoder: !new:speechbrain.lobes.models.dual_path.Encoder
+    kernel_size: !ref <kernel_size>
+    out_channels: !ref <N_encoder_out>
+
+
+SBtfintra: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+    num_layers: 4
+    d_model: !ref <out_channels>
+    nhead: 8
+    d_ffn: 1024
+    dropout: 0
+    use_positional_encoding: True
+    norm_before: True
+
+SBtfinter: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+    num_layers: 4
+    d_model: !ref <out_channels>
+    nhead: 8
+    d_ffn: 1024
+    dropout: 0
+    use_positional_encoding: True
+    norm_before: True
+
+MaskNet: !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
+    num_spks: !ref <num_spks>
+    in_channels: !ref <N_encoder_out>
+    out_channels: !ref <out_channels>
+    num_layers: 1
+    K: 250
+    intra_model: !ref <SBtfintra>
+    inter_model: !ref <SBtfinter>
+    norm: ln
+    linear_layer_after_inter_intra: False
+    skip_around_intra: True
+
+Decoder: !new:speechbrain.lobes.models.dual_path.Decoder
+    in_channels: !ref <N_encoder_out>
+    out_channels: 1
+    kernel_size: !ref <kernel_size>
+    stride: !ref <kernel_stride>
+    bias: False
+
+optimizer: !name:torch.optim.Adam
+    lr: !ref <lr>
+    weight_decay: 0
+
+loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
+
+lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
+    factor: 0.5
+    patience: 2
+    dont_halve_until_epoch: 85
+
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+    limit: !ref <N_epochs>
+
+modules:
+    encoder: !ref <Encoder>
+    decoder: !ref <Decoder>
+    masknet: !ref <MaskNet>
+
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+    checkpoints_dir: !ref <save_folder>
+    recoverables:
+        encoder: !ref <Encoder>
+        decoder: !ref <Decoder>
+        masknet: !ref <MaskNet>
+        counter: !ref <epoch_counter>
+        lr_scheduler: !ref <lr_scheduler>
+
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+    save_file: !ref <train_log>
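Because data_folder is declared !PLACEHOLDER, load_hyperpyyaml refuses to parse this file unless that key is overridden, which is why app.py passes an overrides string. A small sketch of that interaction, plus a sanity check that the instantiated MaskNet plausibly matches the 32,028,552-byte masknet.pth pointer above (at fp32 that is roughly 8M parameters; the exact byte count also includes some serialization overhead):

from hyperpyyaml import load_hyperpyyaml

with open("yamls/sepformer-customdataset.yaml") as f:
    # Without the data_folder override this raises, because of !PLACEHOLDER.
    hparams = load_hyperpyyaml(f, "data_folder: .")

# The YAML instantiates the modules directly; no training code is needed
# just to inspect them.
n_params = sum(p.numel() for p in hparams["MaskNet"].parameters())
print(n_params)  # expect on the order of 32_028_552 / 4, i.e. ~8M at fp32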