Spaces:

Maximofn
/

subtify

Build error

Maximofn committed on Feb 11, 2025

Commit

befe577

1 Parent(s): 2a1684e

Remove separe_vocals.py script for audio voice separation

- Delete standalone script for voice separation using SpeechBrain and ModelScope
- Remove functions for audio processing, sample rate conversion, and voice extraction
- Likely consolidating or refactoring voice separation functionality elsewhere

Files changed (1) hide show

separe_vocals.py +0 -164

separe_vocals.py DELETED Viewed

@@ -1,164 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-import soundfile as sf
-import numpy as np
-import os
-import torch
-import argparse
-import speechbrain as sb
-from speechbrain.dataio.dataio import read_audio
-from speechbrain.pretrained import SepformerSeparation as separator
-import torchaudio
-# Sample rate (in Hz) used when resampling the inputs and when saving the separated voices
-SAMPLE_RATE = 8000
-# Identifiers of the supported voice separation backends
-MODEL_SPEECHBRAIN = "SPEECHBRAIN"
-MODEL_MODELSCOPE = "MODELSCOPE"
-# Backend used by the script; enable the commented assignment (and drop the other) to switch
-# MODEL = MODEL_SPEECHBRAIN
-MODEL = MODEL_MODELSCOPE
-def get_sample_rate(audio_file_path):
-    """
-    Get the sample rate of an audio file
-    Args:
-        audio_file_path (str): Path to the audio file
-    Returns:
-        int: Sample rate of the audio file
-    """
-    _, sample_rate = sf.read(audio_file_path, always_2d=True)
-    return sample_rate
-def change_sample_rate(input_audio_file_path, output_audio_file_path, sample_rate):
-    """
-    Change the sample rate of an audio file
-    Args:
-        input_audio_file_path (str): Path to the input audio file
-        output_audio_file_path (str): Path to the output audio file
-        sample_rate (int): Sample rate to change to
-    """
-    os.system(f'ffmpeg -i {input_audio_file_path} -ar {sample_rate} -loglevel error {output_audio_file_path}')
-def audio_is_stereo(audio_file_path):
-    """
-    Check if an audio file is stereo
-    Args:
-        audio_file_path (str): Path to the audio file
-    Returns:
-        bool: True if the audio file is stereo, False otherwise
-    """
-    audio, _ = sf.read(audio_file_path, always_2d=True)
-    return audio.shape[1] == 2
-def set_mono(input_audio_file_path, output_audio_file_path):
-    """
-    Set an audio file to mono
-    Args:
-        input_audio_file_path (str): Path to the input audio file
-        output_audio_file_path (str): Path to the output audio file
-    """
-    os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 -loglevel error {output_audio_file_path}')
-def write_number_speakers_txt(output_folder, num_speakers):
-    """
-    Write the number of speakers in a txt file
-    Args:
-        output_folder (str): Path to the output folder
-        num_speakers (int): Number of speakers
-    """
-    with open(f"{output_folder}/speakers.txt", 'w') as f:
-        f.write(str(num_speakers))
-def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
-    file, _ = input_audio_file_path.split(".")
-    _, file = file.split("/")
-    est_sources = model.separate_file(path=input_audio_file_path)
-    num_vocals = est_sources.shape[2]
-    speakers = 0
-    for i in range(num_vocals):
-        save_file = f'{output_folder}/{file}_speaker{i:003d}.wav'
-        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)
-        speakers += 1
-    # Write number of speakers in a txt file
-    write_number_speakers_txt(output_folder, speakers)
-def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
-    # Get input and output names
-    input_name, _ = input_audio_file_path.split(".")
-    input_folder, input_name = input_name.split("/")
-    # Set input files with 8k sample rate and mono
-    input_8k = f"{input_folder}/{input_name}_8k.wav"
-    input_8k_mono = f"{input_folder}/{input_name}_8k_mono.wav"
-    # Check if input has 8k sample rate, if not, change it
-    sr = get_sample_rate(input_audio_file_path)
-    if sr != SAMPLE_RATE:
-        change_sample_rate(input, input_8k, SAMPLE_RATE)
-        remove_8k = True
-    else:
-        input_8k = input
-        remove_8k = False
-    # Check if input is stereo, if yes, set it to mono
-    if audio_is_stereo(input_8k):
-        set_mono(input_8k, input_8k_mono)
-        remove_mono = True
-    else:
-        input_8k_mono = input_8k
-        remove_mono = False
-    # Separate audio voices
-    result = model(input_8k_mono)
-    # Save separated audio voices
-    speakers = 0
-    for i, signal in enumerate(result['output_pcm_list']):
-        save_file = f'{output_folder}/{input_name}_speaker{i:003d}.wav'
-        sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
-        speakers += 1
-    # Write number of speakers in a txt file
-    write_number_speakers_txt(output_folder, speakers)
-    # Remove temporary files
-    if remove_8k:
-        os.remove(input_8k)
-    if remove_mono:
-        os.remove(input_8k_mono)
-if __name__ == '__main__':
-    argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
-    argparser.add_argument('inputs_file', type=str, help='File with the list of inputs')
-    argparser.add_argument('device', type=str, help='Device to use for separation')
-    args = argparser.parse_args()
-    device = args.device
-    if MODEL == MODEL_SPEECHBRAIN:
-        if device == 'cpu':
-            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr')
-        elif 'cuda' in device:
-            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr', run_opts={"device":f"{device}"})
-        elif device == 'gpu':
-            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr', run_opts={"device":"cuda"})
-        else:
-            raise ValueError(f"Device {device} is not valid")
-    elif MODEL == MODEL_MODELSCOPE:
-        separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
-    else:
-        raise ValueError(f"Model {MODEL} is not valid")
-    # Read files from input file
-    with open(args.inputs_file, 'r') as f:
-        inputs = f.read().splitlines()
-    output_folder = "vocals"
-    for input in inputs:
-        if MODEL == MODEL_SPEECHBRAIN:
-            separate_vocals_speechbrain(input, output_folder, model)
-        elif MODEL == MODEL_MODELSCOPE:
-            separate_vocals_modelscope(input, output_folder, separation)
-        else:
-            raise ValueError(f"Model {MODEL} is not valid")