Add modelscope and speechbrain separation methods to separe_vocals.py
Browse files- separe_vocals.py +75 -21
separe_vocals.py
CHANGED
|
@@ -5,8 +5,16 @@ import numpy as np
|
|
| 5 |
import os
|
| 6 |
import torch
|
| 7 |
import argparse
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
SAMPLE_RATE = 8000
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def get_sample_rate(audio_file_path):
|
| 12 |
"""
|
|
@@ -51,27 +59,42 @@ def set_mono(input_audio_file_path, output_audio_file_path):
|
|
| 51 |
"""
|
| 52 |
os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 -loglevel error {output_audio_file_path}')
|
| 53 |
|
| 54 |
-
def
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
|
|
|
| 59 |
# Get input and output names
|
| 60 |
-
input_name =
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
# Get folder of output file
|
| 64 |
-
input_folder = input_name.split("/")[0]
|
| 65 |
-
output_folder = "vocals"
|
| 66 |
-
input_file_name = input_name.split("/")[1]
|
| 67 |
-
output_file_name = output_name.split("/")[1]
|
| 68 |
|
| 69 |
# Set input files with 8k sample rate and mono
|
| 70 |
-
input_8k = f"{input_name}_8k.wav"
|
| 71 |
-
input_8k_mono = f"{input_name}_8k_mono.wav"
|
| 72 |
|
| 73 |
# Check if input has 8k sample rate, if not, change it
|
| 74 |
-
sr = get_sample_rate(
|
| 75 |
if sr != SAMPLE_RATE:
|
| 76 |
change_sample_rate(input, input_8k, SAMPLE_RATE)
|
| 77 |
remove_8k = True
|
|
@@ -88,14 +111,17 @@ def main(args):
|
|
| 88 |
remove_mono = False
|
| 89 |
|
| 90 |
# Separate audio voices
|
| 91 |
-
|
| 92 |
-
separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
|
| 93 |
-
result = separation(input_8k_mono)
|
| 94 |
|
| 95 |
# Save separated audio voices
|
|
|
|
| 96 |
for i, signal in enumerate(result['output_pcm_list']):
|
| 97 |
-
save_file = f'{output_folder}/{
|
| 98 |
sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# Remove temporary files
|
| 101 |
if remove_8k:
|
|
@@ -105,6 +131,34 @@ def main(args):
|
|
| 105 |
|
| 106 |
if __name__ == '__main__':
|
| 107 |
argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
|
| 108 |
-
argparser.add_argument('
|
|
|
|
| 109 |
args = argparser.parse_args()
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
import os
|
| 6 |
import torch
|
| 7 |
import argparse
|
| 8 |
+
import speechbrain as sb
|
| 9 |
+
from speechbrain.dataio.dataio import read_audio
|
| 10 |
+
from speechbrain.pretrained import SepformerSeparation as separator
|
| 11 |
+
import torchaudio
|
| 12 |
|
| 13 |
# Sample rate the input audio is checked against / converted to, and the rate
# the separated voices are written with
SAMPLE_RATE = 8000
# Identifiers of the two supported separation backends
MODEL_SPEECHBRAIN = "SPEECHBRAIN"
MODEL_MODELSCOPE = "MODELSCOPE"
# Backend used by the script: assign one of the identifiers above to MODEL
# MODEL = MODEL_SPEECHBRAIN
MODEL = MODEL_MODELSCOPE
|
| 18 |
|
| 19 |
def get_sample_rate(audio_file_path):
|
| 20 |
"""
|
|
|
|
| 59 |
"""
|
| 60 |
os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 -loglevel error {output_audio_file_path}')
|
| 61 |
|
| 62 |
+
def write_number_speakers_txt(output_folder, num_speakers):
    """
    Write the number of speakers in a txt file

    The number is stored as plain text in ``speakers.txt`` inside
    ``output_folder``; an existing file is overwritten.

    Args:
        output_folder (str): Path to the output folder (created if it does not exist)
        num_speakers (int): Number of speakers
    """
    # Make sure the folder exists so the helper also works when no audio file
    # has been written into it yet. An empty string means the current folder.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    speakers_file = os.path.join(output_folder, "speakers.txt")
    with open(speakers_file, 'w', encoding='utf-8') as f:
        f.write(str(num_speakers))
|
| 71 |
+
|
| 72 |
+
def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a speechbrain model

    Every estimated source is saved as
    ``<output_folder>/<name>_speaker<NNN>.wav``, where ``<name>`` is the input
    file name without folder and extension. The number of saved sources is
    then written with write_number_speakers_txt.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Path to the folder where the separated voices are saved
        model: Separation model exposing ``separate_file(path=...)``
    """
    # File name without folder and extension. os.path handles nested folders,
    # bare file names and names containing dots, where unpacking the result of
    # str.split(".") / str.split("/") raised ValueError.
    file = os.path.splitext(os.path.basename(input_audio_file_path))[0]

    est_sources = model.separate_file(path=input_audio_file_path)

    # The third axis of the result indexes the separated sources
    num_vocals = est_sources.shape[2]
    for i in range(num_vocals):
        save_file = f'{output_folder}/{file}_speaker{i:03d}.wav'
        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)

    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, num_vocals)
|
| 86 |
|
| 87 |
+
def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
|
| 88 |
# Get input and output names
|
| 89 |
+
input_name, _ = input_audio_file_path.split(".")
|
| 90 |
+
input_folder, input_name = input_name.split("/")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
# Set input files with 8k sample rate and mono
|
| 93 |
+
input_8k = f"{input_folder}/{input_name}_8k.wav"
|
| 94 |
+
input_8k_mono = f"{input_folder}/{input_name}_8k_mono.wav"
|
| 95 |
|
| 96 |
# Check if input has 8k sample rate, if not, change it
|
| 97 |
+
sr = get_sample_rate(input_audio_file_path)
|
| 98 |
if sr != SAMPLE_RATE:
|
| 99 |
change_sample_rate(input, input_8k, SAMPLE_RATE)
|
| 100 |
remove_8k = True
|
|
|
|
| 111 |
remove_mono = False
|
| 112 |
|
| 113 |
# Separate audio voices
|
| 114 |
+
result = model(input_8k_mono)
|
|
|
|
|
|
|
| 115 |
|
| 116 |
# Save separated audio voices
|
| 117 |
+
speakers = 0
|
| 118 |
for i, signal in enumerate(result['output_pcm_list']):
|
| 119 |
+
save_file = f'{output_folder}/{input_name}_speaker{i:003d}.wav'
|
| 120 |
sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
|
| 121 |
+
speakers += 1
|
| 122 |
+
|
| 123 |
+
# Write number of speakers in a txt file
|
| 124 |
+
write_number_speakers_txt(output_folder, speakers)
|
| 125 |
|
| 126 |
# Remove temporary files
|
| 127 |
if remove_8k:
|
|
|
|
| 131 |
|
| 132 |
if __name__ == '__main__':
    # Command line: a text file with one input audio path per line, and the
    # device string used to build the separation model
    argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
    argparser.add_argument('inputs_file', type=str, help='File with the list of inputs')
    argparser.add_argument('device', type=str, help='Device to use for separation')
    args = argparser.parse_args()

    device = args.device

    # Build the model once and pick the function that drives it
    if MODEL == MODEL_SPEECHBRAIN:
        # Translate the device argument into speechbrain run options:
        # 'cpu' uses the defaults, 'cuda...' is passed through, 'gpu' means 'cuda'
        if device == 'cpu':
            hparams_kwargs = {}
        elif 'cuda' in device:
            hparams_kwargs = {"run_opts": {"device": f"{device}"}}
        elif device == 'gpu':
            hparams_kwargs = {"run_opts": {"device": "cuda"}}
        else:
            raise ValueError(f"Device {device} is not valid")
        model = separator.from_hparams(
            source="speechbrain/sepformer-whamr",
            savedir='pretrained_models/sepformer-whamr',
            **hparams_kwargs,
        )
        separate_vocals = separate_vocals_speechbrain
    elif MODEL == MODEL_MODELSCOPE:
        model = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
        separate_vocals = separate_vocals_modelscope
    else:
        raise ValueError(f"Model {MODEL} is not valid")

    # Read files from input file. Blank lines are skipped: an empty path
    # cannot be split into folder/name/extension by the separation functions.
    with open(args.inputs_file, 'r') as f:
        inputs = [line for line in f.read().splitlines() if line.strip()]

    output_folder = "vocals"
    # NOTE: the loop variable must stay named `input`. separate_vocals_modelscope
    # passes the name `input` to change_sample_rate, which resolves to this
    # module-level variable (otherwise it would be the builtin function).
    for input in inputs:
        separate_vocals(input, output_folder, model)
|