Remove separe_vocals.py script for audio voice separation
Browse files
- Delete standalone script for voice separation using SpeechBrain and ModelScope
- Remove functions for audio processing, sample rate conversion, and voice extraction
- Likely consolidating or refactoring voice separation functionality elsewhere
- separe_vocals.py +0 -164
separe_vocals.py
DELETED
|
@@ -1,164 +0,0 @@
|
|
| 1 |
-
from modelscope.pipelines import pipeline
|
| 2 |
-
from modelscope.utils.constant import Tasks
|
| 3 |
-
import soundfile as sf
|
| 4 |
-
import numpy as np
|
| 5 |
-
import os
|
| 6 |
-
import torch
|
| 7 |
-
import argparse
|
| 8 |
-
import speechbrain as sb
|
| 9 |
-
from speechbrain.dataio.dataio import read_audio
|
| 10 |
-
from speechbrain.pretrained import SepformerSeparation as separator
|
| 11 |
-
import torchaudio
|
| 12 |
-
|
| 13 |
-
# Sample rate (Hz) used for every audio file fed to and written by the separators.
SAMPLE_RATE = 8000

# Identifiers of the supported separation back-ends.
MODEL_SPEECHBRAIN = "SPEECHBRAIN"
MODEL_MODELSCOPE = "MODELSCOPE"

# Back-end used when the script runs; switch by swapping the assignment below.
# MODEL = MODEL_SPEECHBRAIN
MODEL = MODEL_MODELSCOPE
|
| 18 |
-
|
| 19 |
-
def get_sample_rate(audio_file_path):
    """
    Get the sample rate of an audio file

    Only the file metadata is queried (via soundfile.info), so the audio
    samples are not decoded or loaded into memory just to learn the rate.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        int: Sample rate of the audio file
    """
    return sf.info(audio_file_path).samplerate
|
| 30 |
-
|
| 31 |
-
def change_sample_rate(input_audio_file_path, output_audio_file_path, sample_rate):
    """
    Change the sample rate of an audio file

    Runs ffmpeg with an explicit argument list (no shell), so paths that
    contain spaces or shell metacharacters are passed through unchanged.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file
        sample_rate (int): Sample rate to change to

    Raises:
        FileNotFoundError: If the ffmpeg executable cannot be found
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
    """
    # Local import keeps this function self-contained; the module's import
    # block does not provide subprocess.
    import subprocess

    command = [
        "ffmpeg",
        "-i", input_audio_file_path,
        "-ar", str(sample_rate),
        "-loglevel", "error",
        output_audio_file_path,
    ]
    # check=True: a failed conversion is reported here instead of surfacing
    # later as a confusing "file not found" when the output is read.
    subprocess.run(command, check=True)
|
| 40 |
-
|
| 41 |
-
def audio_is_stereo(audio_file_path):
    """
    Check if an audio file is stereo

    Only the file metadata is queried (via soundfile.info), so the audio
    samples are not decoded or loaded into memory just to count channels.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        bool: True if the audio file has exactly two channels, False otherwise
    """
    return sf.info(audio_file_path).channels == 2
|
| 52 |
-
|
| 53 |
-
def set_mono(input_audio_file_path, output_audio_file_path):
    """
    Set an audio file to mono

    Runs ffmpeg with an explicit argument list (no shell), so paths that
    contain spaces or shell metacharacters are passed through unchanged.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file

    Raises:
        FileNotFoundError: If the ffmpeg executable cannot be found
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
    """
    # Local import keeps this function self-contained; the module's import
    # block does not provide subprocess.
    import subprocess

    command = [
        "ffmpeg",
        "-i", input_audio_file_path,
        "-ac", "1",
        "-loglevel", "error",
        output_audio_file_path,
    ]
    # check=True: a failed down-mix is reported here instead of surfacing
    # later as a confusing "file not found" when the output is read.
    subprocess.run(command, check=True)
|
| 61 |
-
|
| 62 |
-
def write_number_speakers_txt(output_folder, num_speakers):
    """
    Store the speaker count in the file speakers.txt inside the output folder

    Any previous content of that file is overwritten; the count is written
    as its plain string form with no trailing newline.

    Args:
        output_folder (str): Path to the output folder
        num_speakers (int): Number of speakers
    """
    speakers_txt_path = f"{output_folder}/speakers.txt"
    with open(speakers_txt_path, 'w') as speakers_txt:
        speakers_txt.write(str(num_speakers))
|
| 71 |
-
|
| 72 |
-
def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a SpeechBrain separation model

    One wav file per separated source is written to the output folder as
    <input name>_speakerNNN.wav, followed by speakers.txt with the count.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Path to the output folder
        model: Separation model; must provide separate_file(path=...)
            (the script passes a SepformerSeparation instance)
    """
    # Base name without folder or extension. os.path handles nested folders,
    # no folder at all, and extra dots in the name; the previous two-way
    # split(".") / split("/") unpacking raised ValueError on all of those.
    file = os.path.splitext(os.path.basename(input_audio_file_path))[0]

    est_sources = model.separate_file(path=input_audio_file_path)
    # The sources are indexed on the last axis below, so its length is the
    # number of separated voices.
    num_vocals = est_sources.shape[2]
    for i in range(num_vocals):
        save_file = f'{output_folder}/{file}_speaker{i:003d}.wav'
        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)

    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, num_vocals)
|
| 86 |
-
|
| 87 |
-
def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a ModelScope separation pipeline

    The input is first converted to SAMPLE_RATE and to mono when needed
    (temporary files are created next to the input and removed afterwards,
    even if separation fails). One wav file per separated voice is written
    to the output folder as <input name>_speakerNNN.wav, followed by
    speakers.txt with the count.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Path to the output folder
        model: Callable separation pipeline; called with the audio path and
            expected to return a mapping with an 'output_pcm_list' entry
    """
    # Get input and output names. os.path handles nested folders, no folder
    # at all, and extra dots in the name; the previous two-way split()
    # unpacking raised ValueError on all of those.
    input_folder = os.path.dirname(input_audio_file_path)
    input_name = os.path.splitext(os.path.basename(input_audio_file_path))[0]

    # Set input files with 8k sample rate and mono
    input_8k = os.path.join(input_folder, f"{input_name}_8k.wav")
    input_8k_mono = os.path.join(input_folder, f"{input_name}_8k_mono.wav")

    # Temporary files this call is responsible for deleting
    temp_files = []
    try:
        # Check if input has 8k sample rate, if not, change it.
        # The function's own parameter is used here; the previous code read
        # the name `input`, which only worked through the caller's
        # module-level loop variable.
        if get_sample_rate(input_audio_file_path) != SAMPLE_RATE:
            # Registered before conversion so a partial output is cleaned up
            temp_files.append(input_8k)
            change_sample_rate(input_audio_file_path, input_8k, SAMPLE_RATE)
        else:
            input_8k = input_audio_file_path

        # Check if input is stereo, if yes, set it to mono
        if audio_is_stereo(input_8k):
            temp_files.append(input_8k_mono)
            set_mono(input_8k, input_8k_mono)
        else:
            input_8k_mono = input_8k

        # Separate audio voices
        result = model(input_8k_mono)

        # Save separated audio voices
        speakers = 0
        for i, signal in enumerate(result['output_pcm_list']):
            save_file = f'{output_folder}/{input_name}_speaker{i:003d}.wav'
            # Each entry is interpreted as raw 16-bit PCM
            sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
            speakers += 1

        # Write number of speakers in a txt file
        write_number_speakers_txt(output_folder, speakers)
    finally:
        # Remove temporary files
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)
|
| 131 |
-
|
| 132 |
-
# Script entry point: load the separation model selected by MODEL, then
# separate every audio file listed (one path per line) in the inputs file.
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
    argparser.add_argument('inputs_file', type=str, help='File with the list of inputs')
    argparser.add_argument('device', type=str, help='Device to use for separation')
    args = argparser.parse_args()

    device = args.device
    if MODEL == MODEL_SPEECHBRAIN:
        # Same source/savedir for every device; only run_opts differs.
        sepformer_kwargs = {
            "source": "speechbrain/sepformer-whamr",
            "savedir": 'pretrained_models/sepformer-whamr',
        }
        if device == 'cpu':
            pass  # no run_opts: library default device
        elif 'cuda' in device:
            sepformer_kwargs["run_opts"] = {"device": f"{device}"}
        elif device == 'gpu':
            sepformer_kwargs["run_opts"] = {"device": "cuda"}
        else:
            raise ValueError(f"Device {device} is not valid")
        model = separator.from_hparams(**sepformer_kwargs)
        separate_vocals, separation_model = separate_vocals_speechbrain, model
    elif MODEL == MODEL_MODELSCOPE:
        separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
        separate_vocals, separation_model = separate_vocals_modelscope, separation
    else:
        raise ValueError(f"Model {MODEL} is not valid")

    # Read files from input file, ignoring blank lines (e.g. a trailing
    # empty line), which are not valid paths.
    with open(args.inputs_file, 'r') as f:
        inputs = [line for line in f.read().splitlines() if line.strip()]

    output_folder = "vocals"
    # The separation functions write into this folder but do not create it.
    os.makedirs(output_folder, exist_ok=True)

    # NOTE: the loop variable shadows the builtin `input`. The name is kept
    # on purpose: separate_vocals_modelscope has historically read this
    # module-level name, so check that function before renaming it.
    for input in inputs:
        separate_vocals(input, output_folder, separation_model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|